Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- log/20250824-20:14:03.log +0 -0
- log/20250901-01:24:50.log +282 -0
- log/20250901-01:28:46.log +0 -0
- log/20250901-01:38:23.log +1 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/args.json +380 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/added_tokens.json +24 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/args.json +380 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/chat_template.jinja +54 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/config.json +60 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/generation_config.json +9 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/latest +1 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/merges.txt +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00001-of-00004.safetensors +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00002-of-00004.safetensors +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00003-of-00004.safetensors +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00004-of-00004.safetensors +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model.safetensors.index.json +347 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/scheduler.pt +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/special_tokens_map.json +31 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/tokenizer.json +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/tokenizer_config.json +207 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/trainer_state.json +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/training_args.bin +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/vocab.json +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/zero_to_fp32.py +760 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_loss.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_runtime.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_samples_per_second.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_steps_per_second.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_token_acc.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_epoch.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_grad_norm.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_learning_rate.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_loss.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_token_acc.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_total_flos.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_loss.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_runtime.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_samples_per_second.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_steps_per_second.png +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/logging.jsonl +0 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/runs/events.out.tfevents.1756662002.TENCENT64.site.163428.0 +3 -0
- qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/val_dataset.jsonl +9 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/args.json +380 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/added_tokens.json +24 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/args.json +380 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/chat_template.jinja +54 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/config.json +60 -0
- qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/generation_config.json +9 -0
.gitattributes
CHANGED
|
@@ -47,3 +47,5 @@ qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json fil
|
|
| 47 |
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 48 |
qwen2.5-7b-newhot-1369q-1e_6-old-click-1ep/checkpoint-94/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 49 |
qwen2.5-7b-1500q-1500q-600q-old-click-4ep/checkpoint-772/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 47 |
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 48 |
qwen2.5-7b-newhot-1369q-1e_6-old-click-1ep/checkpoint-94/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 49 |
qwen2.5-7b-1500q-1500q-600q-old-click-4ep/checkpoint-772/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
log/20250824-20:14:03.log
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
log/20250901-01:24:50.log
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B --train_type full --dataset corr_nq_1500q_hotpot_1500q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 4 --per_device_train_batch_size 4 --learning_rate 6e-5 --gradient_accumulation_steps 2 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
Traceback (most recent call last):
|
| 207 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/utils/versions.py", line 102, in require_version
|
| 208 |
+
got_ver = importlib.metadata.version(pkg)
|
| 209 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/importlib/metadata/__init__.py", line 996, in version
|
| 210 |
+
return distribution(distribution_name).version
|
| 211 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/importlib/metadata/__init__.py", line 969, in distribution
|
| 212 |
+
return Distribution.from_name(distribution_name)
|
| 213 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/importlib/metadata/__init__.py", line 548, in from_name
|
| 214 |
+
raise PackageNotFoundError(name)
|
| 215 |
+
importlib.metadata.PackageNotFoundError: No package metadata was found for deepspeed
|
| 216 |
+
|
| 217 |
+
During handling of the above exception, another exception occurred:
|
| 218 |
+
|
| 219 |
+
Traceback (most recent call last):
|
| 220 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 221 |
+
sft_main()
|
| 222 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 223 |
+
return SwiftSft(args).main()
|
| 224 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 27, in __init__
|
| 225 |
+
super().__init__(args)
|
| 226 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 19, in __init__
|
| 227 |
+
self.args = self._parse_args(args)
|
| 228 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 31, in _parse_args
|
| 229 |
+
args, remaining_argv = parse_args(self.args_class, args)
|
| 230 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/utils/utils.py", line 152, in parse_args
|
| 231 |
+
args, remaining_args = parser.parse_args_into_dataclasses(argv, return_remaining_strings=True)
|
| 232 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
|
| 233 |
+
obj = dtype(**inputs)
|
| 234 |
+
File "<string>", line 323, in __init__
|
| 235 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 187, in __post_init__
|
| 236 |
+
self._init_deepspeed()
|
| 237 |
+
File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 203, in _init_deepspeed
|
| 238 |
+
require_version('deepspeed')
|
| 239 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/utils/versions.py", line 104, in require_version
|
| 240 |
+
raise importlib.metadata.PackageNotFoundError(
|
| 241 |
+
importlib.metadata.PackageNotFoundError: No package metadata was found for The 'deepspeed' distribution was not found and is required by this application.
|
| 242 |
+
W0901 01:25:41.502000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60110 closing signal SIGTERM
|
| 243 |
+
W0901 01:25:41.502000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60111 closing signal SIGTERM
|
| 244 |
+
W0901 01:25:41.503000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60112 closing signal SIGTERM
|
| 245 |
+
W0901 01:25:41.504000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60113 closing signal SIGTERM
|
| 246 |
+
W0901 01:25:41.505000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60114 closing signal SIGTERM
|
| 247 |
+
W0901 01:25:41.505000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60115 closing signal SIGTERM
|
| 248 |
+
W0901 01:25:41.506000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 60116 closing signal SIGTERM
|
| 249 |
+
E0901 01:25:41.620000 60034 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 7 (pid: 60117) of binary: /data/miniforge/envs/ms-swift/bin/python3.10
|
| 250 |
+
Traceback (most recent call last):
|
| 251 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
| 252 |
+
return _run_code(code, main_globals, None,
|
| 253 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
|
| 254 |
+
exec(code, run_globals)
|
| 255 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
|
| 256 |
+
main()
|
| 257 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
|
| 258 |
+
return f(*args, **kwargs)
|
| 259 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
|
| 260 |
+
run(args)
|
| 261 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
|
| 262 |
+
elastic_launch(
|
| 263 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
|
| 264 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 265 |
+
File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
|
| 266 |
+
raise ChildFailedError(
|
| 267 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 268 |
+
============================================================
|
| 269 |
+
/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
|
| 270 |
+
------------------------------------------------------------
|
| 271 |
+
Failures:
|
| 272 |
+
<NO_OTHER_FAILURES>
|
| 273 |
+
------------------------------------------------------------
|
| 274 |
+
Root Cause (first observed failure):
|
| 275 |
+
[0]:
|
| 276 |
+
time : 2025-09-01_01:25:41
|
| 277 |
+
host : TENCENT64.site
|
| 278 |
+
rank : 7 (local_rank: 7)
|
| 279 |
+
exitcode : 1 (pid: 60117)
|
| 280 |
+
error_file: <N/A>
|
| 281 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 282 |
+
============================================================
|
log/20250901-01:28:46.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
log/20250901-01:38:23.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
test.sh: line 51: egy: command not found
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/args.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 4,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 2,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 6e-05,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 4.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"corr_nq_1500q_hotpot_1500q_swift.jsonl"
|
| 236 |
+
],
|
| 237 |
+
"val_dataset": [],
|
| 238 |
+
"split_dataset_ratio": 0.001,
|
| 239 |
+
"dataset_num_proc": 100,
|
| 240 |
+
"load_from_cache_file": true,
|
| 241 |
+
"dataset_shuffle": true,
|
| 242 |
+
"val_dataset_shuffle": false,
|
| 243 |
+
"streaming": false,
|
| 244 |
+
"interleave_prob": null,
|
| 245 |
+
"stopping_strategy": "first_exhausted",
|
| 246 |
+
"shuffle_buffer_size": 1000,
|
| 247 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 248 |
+
"columns": {},
|
| 249 |
+
"strict": false,
|
| 250 |
+
"model_name": null,
|
| 251 |
+
"model_author": null,
|
| 252 |
+
"custom_dataset_info": [],
|
| 253 |
+
"quant_method": null,
|
| 254 |
+
"quant_bits": null,
|
| 255 |
+
"hqq_axis": null,
|
| 256 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 257 |
+
"bnb_4bit_quant_type": "nf4",
|
| 258 |
+
"bnb_4bit_use_double_quant": true,
|
| 259 |
+
"bnb_4bit_quant_storage": null,
|
| 260 |
+
"max_new_tokens": 64,
|
| 261 |
+
"temperature": 0.0,
|
| 262 |
+
"top_k": null,
|
| 263 |
+
"top_p": null,
|
| 264 |
+
"repetition_penalty": null,
|
| 265 |
+
"num_beams": 1,
|
| 266 |
+
"stream": false,
|
| 267 |
+
"stop_words": [],
|
| 268 |
+
"logprobs": false,
|
| 269 |
+
"top_logprobs": null,
|
| 270 |
+
"ckpt_dir": null,
|
| 271 |
+
"lora_modules": [],
|
| 272 |
+
"train_type": "full",
|
| 273 |
+
"adapters": [],
|
| 274 |
+
"external_plugins": [],
|
| 275 |
+
"model_kwargs": {},
|
| 276 |
+
"load_args": false,
|
| 277 |
+
"load_data_args": false,
|
| 278 |
+
"packing": false,
|
| 279 |
+
"packing_length": null,
|
| 280 |
+
"lazy_tokenize": false,
|
| 281 |
+
"cached_dataset": [],
|
| 282 |
+
"custom_register_path": [],
|
| 283 |
+
"use_hf": false,
|
| 284 |
+
"ignore_args_error": false,
|
| 285 |
+
"use_swift_lora": false,
|
| 286 |
+
"freeze_parameters": [],
|
| 287 |
+
"freeze_parameters_regex": null,
|
| 288 |
+
"freeze_parameters_ratio": 0.0,
|
| 289 |
+
"trainable_parameters": [],
|
| 290 |
+
"trainable_parameters_regex": null,
|
| 291 |
+
"freeze_llm": false,
|
| 292 |
+
"freeze_vit": true,
|
| 293 |
+
"freeze_aligner": false,
|
| 294 |
+
"target_modules": [
|
| 295 |
+
"all-linear"
|
| 296 |
+
],
|
| 297 |
+
"target_regex": null,
|
| 298 |
+
"modules_to_save": [],
|
| 299 |
+
"lora_rank": 8,
|
| 300 |
+
"lora_alpha": 32,
|
| 301 |
+
"lora_dropout": 0.05,
|
| 302 |
+
"lora_bias": "none",
|
| 303 |
+
"lora_dtype": null,
|
| 304 |
+
"lorap_lr_ratio": null,
|
| 305 |
+
"use_rslora": false,
|
| 306 |
+
"use_dora": false,
|
| 307 |
+
"lora_ga_batch_size": 2,
|
| 308 |
+
"lora_ga_iters": 2,
|
| 309 |
+
"lora_ga_max_length": 1024,
|
| 310 |
+
"lora_ga_direction": "ArB2r",
|
| 311 |
+
"lora_ga_scale": "stable",
|
| 312 |
+
"lora_ga_stable_gamma": 16,
|
| 313 |
+
"init_weights": true,
|
| 314 |
+
"fourier_n_frequency": 2000,
|
| 315 |
+
"fourier_scaling": 300.0,
|
| 316 |
+
"boft_block_size": 4,
|
| 317 |
+
"boft_block_num": 0,
|
| 318 |
+
"boft_n_butterfly_factor": 1,
|
| 319 |
+
"boft_dropout": 0.0,
|
| 320 |
+
"vera_rank": 256,
|
| 321 |
+
"vera_projection_prng_key": 0,
|
| 322 |
+
"vera_dropout": 0.0,
|
| 323 |
+
"vera_d_initial": 0.1,
|
| 324 |
+
"adapter_act": "gelu",
|
| 325 |
+
"adapter_length": 128,
|
| 326 |
+
"use_galore": false,
|
| 327 |
+
"galore_target_modules": null,
|
| 328 |
+
"galore_rank": 128,
|
| 329 |
+
"galore_update_proj_gap": 50,
|
| 330 |
+
"galore_scale": 1.0,
|
| 331 |
+
"galore_proj_type": "std",
|
| 332 |
+
"galore_optim_per_parameter": false,
|
| 333 |
+
"galore_with_embedding": false,
|
| 334 |
+
"galore_quantization": false,
|
| 335 |
+
"galore_proj_quant": false,
|
| 336 |
+
"galore_proj_bits": 4,
|
| 337 |
+
"galore_proj_group_size": 256,
|
| 338 |
+
"galore_cos_threshold": 0.4,
|
| 339 |
+
"galore_gamma_proj": 2,
|
| 340 |
+
"galore_queue_size": 5,
|
| 341 |
+
"adalora_target_r": 8,
|
| 342 |
+
"adalora_init_r": 12,
|
| 343 |
+
"adalora_tinit": 0,
|
| 344 |
+
"adalora_tfinal": 0,
|
| 345 |
+
"adalora_deltaT": 1,
|
| 346 |
+
"adalora_beta1": 0.85,
|
| 347 |
+
"adalora_beta2": 0.85,
|
| 348 |
+
"adalora_orth_reg_weight": 0.5,
|
| 349 |
+
"llamapro_num_new_blocks": 4,
|
| 350 |
+
"llamapro_num_groups": null,
|
| 351 |
+
"lisa_activated_layers": 0,
|
| 352 |
+
"lisa_step_interval": 20,
|
| 353 |
+
"reft_layer_key": null,
|
| 354 |
+
"reft_layers": null,
|
| 355 |
+
"reft_rank": 4,
|
| 356 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 357 |
+
"reft_args": null,
|
| 358 |
+
"swanlab_token": null,
|
| 359 |
+
"swanlab_project": null,
|
| 360 |
+
"swanlab_workspace": null,
|
| 361 |
+
"swanlab_exp_name": null,
|
| 362 |
+
"swanlab_lark_webhook_url": null,
|
| 363 |
+
"swanlab_lark_secret": null,
|
| 364 |
+
"swanlab_mode": "cloud",
|
| 365 |
+
"add_version": true,
|
| 366 |
+
"create_checkpoint_symlink": false,
|
| 367 |
+
"zero_hpz_partition_size": null,
|
| 368 |
+
"deepspeed_autotp_size": null,
|
| 369 |
+
"early_stop_interval": null,
|
| 370 |
+
"rank": 0,
|
| 371 |
+
"global_world_size": 8,
|
| 372 |
+
"local_world_size": 8,
|
| 373 |
+
"model_suffix": "Qwen2.5-7B",
|
| 374 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 375 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fa13a615f30>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 376 |
+
"model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
|
| 377 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 378 |
+
"evaluation_strategy": "epoch",
|
| 379 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=6e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 380 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/args.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 4,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 2,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 6e-05,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 4.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"corr_nq_1500q_hotpot_1500q_swift.jsonl"
|
| 236 |
+
],
|
| 237 |
+
"val_dataset": [],
|
| 238 |
+
"split_dataset_ratio": 0.001,
|
| 239 |
+
"dataset_num_proc": 100,
|
| 240 |
+
"load_from_cache_file": true,
|
| 241 |
+
"dataset_shuffle": true,
|
| 242 |
+
"val_dataset_shuffle": false,
|
| 243 |
+
"streaming": false,
|
| 244 |
+
"interleave_prob": null,
|
| 245 |
+
"stopping_strategy": "first_exhausted",
|
| 246 |
+
"shuffle_buffer_size": 1000,
|
| 247 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 248 |
+
"columns": {},
|
| 249 |
+
"strict": false,
|
| 250 |
+
"model_name": null,
|
| 251 |
+
"model_author": null,
|
| 252 |
+
"custom_dataset_info": [],
|
| 253 |
+
"quant_method": null,
|
| 254 |
+
"quant_bits": null,
|
| 255 |
+
"hqq_axis": null,
|
| 256 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 257 |
+
"bnb_4bit_quant_type": "nf4",
|
| 258 |
+
"bnb_4bit_use_double_quant": true,
|
| 259 |
+
"bnb_4bit_quant_storage": null,
|
| 260 |
+
"max_new_tokens": 64,
|
| 261 |
+
"temperature": 0.0,
|
| 262 |
+
"top_k": null,
|
| 263 |
+
"top_p": null,
|
| 264 |
+
"repetition_penalty": null,
|
| 265 |
+
"num_beams": 1,
|
| 266 |
+
"stream": false,
|
| 267 |
+
"stop_words": [],
|
| 268 |
+
"logprobs": false,
|
| 269 |
+
"top_logprobs": null,
|
| 270 |
+
"ckpt_dir": null,
|
| 271 |
+
"lora_modules": [],
|
| 272 |
+
"train_type": "full",
|
| 273 |
+
"adapters": [],
|
| 274 |
+
"external_plugins": [],
|
| 275 |
+
"model_kwargs": {},
|
| 276 |
+
"load_args": false,
|
| 277 |
+
"load_data_args": false,
|
| 278 |
+
"packing": false,
|
| 279 |
+
"packing_length": null,
|
| 280 |
+
"lazy_tokenize": false,
|
| 281 |
+
"cached_dataset": [],
|
| 282 |
+
"custom_register_path": [],
|
| 283 |
+
"use_hf": false,
|
| 284 |
+
"ignore_args_error": false,
|
| 285 |
+
"use_swift_lora": false,
|
| 286 |
+
"freeze_parameters": [],
|
| 287 |
+
"freeze_parameters_regex": null,
|
| 288 |
+
"freeze_parameters_ratio": 0.0,
|
| 289 |
+
"trainable_parameters": [],
|
| 290 |
+
"trainable_parameters_regex": null,
|
| 291 |
+
"freeze_llm": false,
|
| 292 |
+
"freeze_vit": true,
|
| 293 |
+
"freeze_aligner": false,
|
| 294 |
+
"target_modules": [
|
| 295 |
+
"all-linear"
|
| 296 |
+
],
|
| 297 |
+
"target_regex": null,
|
| 298 |
+
"modules_to_save": [],
|
| 299 |
+
"lora_rank": 8,
|
| 300 |
+
"lora_alpha": 32,
|
| 301 |
+
"lora_dropout": 0.05,
|
| 302 |
+
"lora_bias": "none",
|
| 303 |
+
"lora_dtype": null,
|
| 304 |
+
"lorap_lr_ratio": null,
|
| 305 |
+
"use_rslora": false,
|
| 306 |
+
"use_dora": false,
|
| 307 |
+
"lora_ga_batch_size": 2,
|
| 308 |
+
"lora_ga_iters": 2,
|
| 309 |
+
"lora_ga_max_length": 1024,
|
| 310 |
+
"lora_ga_direction": "ArB2r",
|
| 311 |
+
"lora_ga_scale": "stable",
|
| 312 |
+
"lora_ga_stable_gamma": 16,
|
| 313 |
+
"init_weights": true,
|
| 314 |
+
"fourier_n_frequency": 2000,
|
| 315 |
+
"fourier_scaling": 300.0,
|
| 316 |
+
"boft_block_size": 4,
|
| 317 |
+
"boft_block_num": 0,
|
| 318 |
+
"boft_n_butterfly_factor": 1,
|
| 319 |
+
"boft_dropout": 0.0,
|
| 320 |
+
"vera_rank": 256,
|
| 321 |
+
"vera_projection_prng_key": 0,
|
| 322 |
+
"vera_dropout": 0.0,
|
| 323 |
+
"vera_d_initial": 0.1,
|
| 324 |
+
"adapter_act": "gelu",
|
| 325 |
+
"adapter_length": 128,
|
| 326 |
+
"use_galore": false,
|
| 327 |
+
"galore_target_modules": null,
|
| 328 |
+
"galore_rank": 128,
|
| 329 |
+
"galore_update_proj_gap": 50,
|
| 330 |
+
"galore_scale": 1.0,
|
| 331 |
+
"galore_proj_type": "std",
|
| 332 |
+
"galore_optim_per_parameter": false,
|
| 333 |
+
"galore_with_embedding": false,
|
| 334 |
+
"galore_quantization": false,
|
| 335 |
+
"galore_proj_quant": false,
|
| 336 |
+
"galore_proj_bits": 4,
|
| 337 |
+
"galore_proj_group_size": 256,
|
| 338 |
+
"galore_cos_threshold": 0.4,
|
| 339 |
+
"galore_gamma_proj": 2,
|
| 340 |
+
"galore_queue_size": 5,
|
| 341 |
+
"adalora_target_r": 8,
|
| 342 |
+
"adalora_init_r": 12,
|
| 343 |
+
"adalora_tinit": 0,
|
| 344 |
+
"adalora_tfinal": 0,
|
| 345 |
+
"adalora_deltaT": 1,
|
| 346 |
+
"adalora_beta1": 0.85,
|
| 347 |
+
"adalora_beta2": 0.85,
|
| 348 |
+
"adalora_orth_reg_weight": 0.5,
|
| 349 |
+
"llamapro_num_new_blocks": 4,
|
| 350 |
+
"llamapro_num_groups": null,
|
| 351 |
+
"lisa_activated_layers": 0,
|
| 352 |
+
"lisa_step_interval": 20,
|
| 353 |
+
"reft_layer_key": null,
|
| 354 |
+
"reft_layers": null,
|
| 355 |
+
"reft_rank": 4,
|
| 356 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 357 |
+
"reft_args": null,
|
| 358 |
+
"swanlab_token": null,
|
| 359 |
+
"swanlab_project": null,
|
| 360 |
+
"swanlab_workspace": null,
|
| 361 |
+
"swanlab_exp_name": null,
|
| 362 |
+
"swanlab_lark_webhook_url": null,
|
| 363 |
+
"swanlab_lark_secret": null,
|
| 364 |
+
"swanlab_mode": "cloud",
|
| 365 |
+
"add_version": true,
|
| 366 |
+
"create_checkpoint_symlink": false,
|
| 367 |
+
"zero_hpz_partition_size": null,
|
| 368 |
+
"deepspeed_autotp_size": null,
|
| 369 |
+
"early_stop_interval": null,
|
| 370 |
+
"rank": 0,
|
| 371 |
+
"global_world_size": 8,
|
| 372 |
+
"local_world_size": 8,
|
| 373 |
+
"model_suffix": "Qwen2.5-7B",
|
| 374 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 375 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fa13a615f30>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 376 |
+
"model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
|
| 377 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 378 |
+
"evaluation_strategy": "epoch",
|
| 379 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=6e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250901-013908', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 380 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"eos_token_id": 151643,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 3584,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 18944,
|
| 12 |
+
"layer_types": [
|
| 13 |
+
"full_attention",
|
| 14 |
+
"full_attention",
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention"
|
| 41 |
+
],
|
| 42 |
+
"max_position_embeddings": 131072,
|
| 43 |
+
"max_window_layers": 28,
|
| 44 |
+
"model_type": "qwen2",
|
| 45 |
+
"num_attention_heads": 28,
|
| 46 |
+
"num_hidden_layers": 28,
|
| 47 |
+
"num_key_value_heads": 4,
|
| 48 |
+
"pad_token_id": 151643,
|
| 49 |
+
"rms_norm_eps": 1e-06,
|
| 50 |
+
"rope_scaling": null,
|
| 51 |
+
"rope_theta": 1000000.0,
|
| 52 |
+
"sliding_window": null,
|
| 53 |
+
"tie_word_embeddings": false,
|
| 54 |
+
"torch_dtype": "bfloat16",
|
| 55 |
+
"transformers_version": "4.55.4",
|
| 56 |
+
"use_cache": false,
|
| 57 |
+
"use_mrope": false,
|
| 58 |
+
"use_sliding_window": false,
|
| 59 |
+
"vocab_size": 152064
|
| 60 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"eos_token_id": [
|
| 4 |
+
151643,
|
| 5 |
+
151645
|
| 6 |
+
],
|
| 7 |
+
"max_new_tokens": 2048,
|
| 8 |
+
"transformers_version": "4.55.4"
|
| 9 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/latest
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
global_step624
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00001-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:060362d19ca624229d19358cda6c22b9081ee996e6408699d87383eb9aab885f
|
| 3 |
+
size 4877660776
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00002-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:492e0db2e260a723331abe3eec1d578d76c8b355c530ff39a740aa05492375e3
|
| 3 |
+
size 4932751008
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00003-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60d7724f5ee645b2a7fc300ed63573d0616ef78002ec98907656937382e0bb17
|
| 3 |
+
size 4330865200
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model-00004-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a1df5b68c8a56dbdcaff2731ad0fa39b3a4289ead62d2c1e2a6d39dbe325eca
|
| 3 |
+
size 1089994880
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 333312,
|
| 4 |
+
"total_size": 15231233024
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"lm_head.weight": "model-00004-of-00004.safetensors",
|
| 8 |
+
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
| 9 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 10 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 11 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 12 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 13 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 19 |
+
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 20 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 21 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 22 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 23 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 24 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 25 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 31 |
+
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 32 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 33 |
+
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 34 |
+
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 35 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 36 |
+
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 37 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 42 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 43 |
+
"model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 44 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 45 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 46 |
+
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 47 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 48 |
+
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 49 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 53 |
+
"model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 54 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 55 |
+
"model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 56 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 57 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 58 |
+
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 59 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 60 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 61 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 64 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 65 |
+
"model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 66 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 67 |
+
"model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 68 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 69 |
+
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 70 |
+
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 71 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 72 |
+
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 73 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 75 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 76 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 77 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 78 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 79 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 80 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 81 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 82 |
+
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 83 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 84 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 85 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 86 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 87 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 88 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 89 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 90 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 91 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 92 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 93 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 94 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 95 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 96 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 97 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 98 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 99 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 100 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 101 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 102 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 103 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 104 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 105 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 106 |
+
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 107 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 108 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 109 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 115 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 116 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 117 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 118 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 119 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 120 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 121 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 127 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 128 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 129 |
+
"model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 130 |
+
"model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 131 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 132 |
+
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 133 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 139 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 140 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 141 |
+
"model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 142 |
+
"model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 143 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 144 |
+
"model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 145 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 151 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 152 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 153 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 154 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 155 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 156 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 157 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 163 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 164 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 165 |
+
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 166 |
+
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 167 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 168 |
+
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 169 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 175 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 176 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 177 |
+
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 178 |
+
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 179 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 180 |
+
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 181 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 187 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 188 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 189 |
+
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 190 |
+
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 191 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 192 |
+
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 193 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 199 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 200 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 201 |
+
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 202 |
+
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 203 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 204 |
+
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 205 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 211 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 212 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 213 |
+
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 214 |
+
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 215 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 216 |
+
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 217 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 223 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 224 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 225 |
+
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 226 |
+
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 227 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 228 |
+
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 229 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 235 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 236 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 237 |
+
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 238 |
+
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 239 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 240 |
+
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 241 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 247 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 248 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 249 |
+
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 250 |
+
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 251 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 252 |
+
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 253 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 259 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 260 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 261 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 262 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 263 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 264 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 265 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 266 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 267 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 268 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 269 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 270 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 271 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 272 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 273 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 274 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 275 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 276 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 277 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 278 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 279 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 280 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 281 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 282 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 283 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 284 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 285 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 286 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 287 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 288 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 289 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 290 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 291 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 292 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 293 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 294 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 295 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 296 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 297 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 298 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 299 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 300 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 301 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 302 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 303 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 304 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 305 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 306 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 307 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 308 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 309 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 310 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 311 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 312 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 313 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 314 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 315 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 316 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 317 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 318 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 319 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 320 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 321 |
+
"model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 322 |
+
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 323 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 324 |
+
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 325 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 326 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 327 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 328 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 329 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 330 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 331 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 332 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 333 |
+
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 334 |
+
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 335 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 336 |
+
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 337 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 338 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 339 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 340 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 341 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 342 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 343 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 344 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 345 |
+
"model.norm.weight": "model-00003-of-00004.safetensors"
|
| 346 |
+
}
|
| 347 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15cbd8f93a88f99710560a94ea88cf61004ed1f50e9208ef60baf230d04a2806
|
| 3 |
+
size 1465
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|endoftext|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|endoftext|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9dfcdb61e0d90e7243daf4c031e826d456417aeff8498d85c65cf0bce5bea790
|
| 3 |
+
size 9041
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/checkpoint-624/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# Copyright (c) Microsoft Corporation.
|
| 4 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 5 |
+
|
| 6 |
+
# DeepSpeed Team
|
| 7 |
+
|
| 8 |
+
# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
|
| 9 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
| 10 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
| 11 |
+
# application.
|
| 12 |
+
#
|
| 13 |
+
# example:
|
| 14 |
+
# python zero_to_fp32.py . output_dir/
|
| 15 |
+
# or
|
| 16 |
+
# python zero_to_fp32.py . output_dir/ --safe_serialization
|
| 17 |
+
|
| 18 |
+
import argparse
|
| 19 |
+
import torch
|
| 20 |
+
import glob
|
| 21 |
+
import math
|
| 22 |
+
import os
|
| 23 |
+
import re
|
| 24 |
+
import gc
|
| 25 |
+
import json
|
| 26 |
+
import numpy as np
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
from collections import OrderedDict
|
| 29 |
+
from dataclasses import dataclass
|
| 30 |
+
|
| 31 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
| 32 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
| 33 |
+
from deepspeed.utils import logger
|
| 34 |
+
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
|
| 35 |
+
FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
|
| 36 |
+
FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
|
| 40 |
+
class zero_model_state:
|
| 41 |
+
buffers: dict()
|
| 42 |
+
param_shapes: dict()
|
| 43 |
+
shared_params: list
|
| 44 |
+
ds_version: int
|
| 45 |
+
frozen_param_shapes: dict()
|
| 46 |
+
frozen_param_fragments: dict()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
debug = 0
|
| 50 |
+
|
| 51 |
+
# load to cpu
|
| 52 |
+
device = torch.device('cpu')
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def atoi(text):
|
| 56 |
+
return int(text) if text.isdigit() else text
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def natural_keys(text):
|
| 60 |
+
'''
|
| 61 |
+
alist.sort(key=natural_keys) sorts in human order
|
| 62 |
+
http://nedbatchelder.com/blog/200712/human_sorting.html
|
| 63 |
+
(See Toothy's implementation in the comments)
|
| 64 |
+
'''
|
| 65 |
+
return [atoi(c) for c in re.split(r'(\d+)', text)]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
|
| 69 |
+
if not os.path.isdir(checkpoint_dir):
|
| 70 |
+
raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
|
| 71 |
+
|
| 72 |
+
# there should be only one file
|
| 73 |
+
if zero_stage <= 2:
|
| 74 |
+
file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
|
| 75 |
+
elif zero_stage == 3:
|
| 76 |
+
file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
|
| 77 |
+
|
| 78 |
+
if not os.path.exists(file):
|
| 79 |
+
raise FileNotFoundError(f"can't find model states file at '{file}'")
|
| 80 |
+
|
| 81 |
+
return file
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
|
| 85 |
+
# XXX: need to test that this simple glob rule works for multi-node setup too
|
| 86 |
+
ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
|
| 87 |
+
|
| 88 |
+
if len(ckpt_files) == 0:
|
| 89 |
+
raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
|
| 90 |
+
|
| 91 |
+
return ckpt_files
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def get_optim_files(checkpoint_dir):
|
| 95 |
+
return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_model_state_files(checkpoint_dir):
|
| 99 |
+
return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def parse_model_states(files):
|
| 103 |
+
zero_model_states = []
|
| 104 |
+
for file in files:
|
| 105 |
+
state_dict = torch.load(file, map_location=device, weights_only=False)
|
| 106 |
+
|
| 107 |
+
if BUFFER_NAMES not in state_dict:
|
| 108 |
+
raise ValueError(f"{file} is not a model state checkpoint")
|
| 109 |
+
buffer_names = state_dict[BUFFER_NAMES]
|
| 110 |
+
if debug:
|
| 111 |
+
print("Found buffers:", buffer_names)
|
| 112 |
+
|
| 113 |
+
# recover just the buffers while restoring them to fp32 if they were saved in fp16
|
| 114 |
+
buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
|
| 115 |
+
param_shapes = state_dict[PARAM_SHAPES]
|
| 116 |
+
|
| 117 |
+
# collect parameters that are included in param_shapes
|
| 118 |
+
param_names = []
|
| 119 |
+
for s in param_shapes:
|
| 120 |
+
for name in s.keys():
|
| 121 |
+
param_names.append(name)
|
| 122 |
+
|
| 123 |
+
# update with frozen parameters
|
| 124 |
+
frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
|
| 125 |
+
if frozen_param_shapes is not None:
|
| 126 |
+
if debug:
|
| 127 |
+
print(f"Found frozen_param_shapes: {frozen_param_shapes}")
|
| 128 |
+
param_names += list(frozen_param_shapes.keys())
|
| 129 |
+
|
| 130 |
+
# handle shared params
|
| 131 |
+
shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
|
| 132 |
+
|
| 133 |
+
ds_version = state_dict.get(DS_VERSION, None)
|
| 134 |
+
|
| 135 |
+
frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
|
| 136 |
+
|
| 137 |
+
z_model_state = zero_model_state(buffers=buffers,
|
| 138 |
+
param_shapes=param_shapes,
|
| 139 |
+
shared_params=shared_params,
|
| 140 |
+
ds_version=ds_version,
|
| 141 |
+
frozen_param_shapes=frozen_param_shapes,
|
| 142 |
+
frozen_param_fragments=frozen_param_fragments)
|
| 143 |
+
zero_model_states.append(z_model_state)
|
| 144 |
+
|
| 145 |
+
return zero_model_states
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def parse_optim_states(files, ds_checkpoint_dir):
|
| 149 |
+
total_files = len(files)
|
| 150 |
+
state_dicts = []
|
| 151 |
+
for f in tqdm(files, desc='Loading checkpoint shards'):
|
| 152 |
+
state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
|
| 153 |
+
# immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
|
| 154 |
+
# and also handle the case where it was already removed by another helper script
|
| 155 |
+
state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
|
| 156 |
+
state_dicts.append(state_dict)
|
| 157 |
+
|
| 158 |
+
if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
|
| 159 |
+
raise ValueError(f"{files[0]} is not a zero checkpoint")
|
| 160 |
+
zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
|
| 161 |
+
world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
|
| 162 |
+
|
| 163 |
+
# For ZeRO-2 each param group can have different partition_count as data parallelism for expert
|
| 164 |
+
# parameters can be different from data parallelism for non-expert parameters. So we can just
|
| 165 |
+
# use the max of the partition_count to get the dp world_size.
|
| 166 |
+
|
| 167 |
+
if type(world_size) is list:
|
| 168 |
+
world_size = max(world_size)
|
| 169 |
+
|
| 170 |
+
if world_size != total_files:
|
| 171 |
+
raise ValueError(
|
| 172 |
+
f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
|
| 173 |
+
"Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# the groups are named differently in each stage
|
| 177 |
+
if zero_stage <= 2:
|
| 178 |
+
fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
|
| 179 |
+
elif zero_stage == 3:
|
| 180 |
+
fp32_groups_key = FP32_FLAT_GROUPS
|
| 181 |
+
else:
|
| 182 |
+
raise ValueError(f"unknown zero stage {zero_stage}")
|
| 183 |
+
|
| 184 |
+
fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
|
| 185 |
+
return zero_stage, world_size, fp32_flat_groups
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
|
| 189 |
+
"""
|
| 190 |
+
Returns fp32 state_dict reconstructed from ds checkpoint
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
- ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
|
| 194 |
+
|
| 195 |
+
"""
|
| 196 |
+
print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
|
| 197 |
+
|
| 198 |
+
optim_files = get_optim_files(ds_checkpoint_dir)
|
| 199 |
+
zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
|
| 200 |
+
print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
|
| 201 |
+
|
| 202 |
+
model_files = get_model_state_files(ds_checkpoint_dir)
|
| 203 |
+
|
| 204 |
+
zero_model_states = parse_model_states(model_files)
|
| 205 |
+
print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
|
| 206 |
+
|
| 207 |
+
if zero_stage <= 2:
|
| 208 |
+
return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
|
| 209 |
+
exclude_frozen_parameters)
|
| 210 |
+
elif zero_stage == 3:
|
| 211 |
+
return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
|
| 212 |
+
exclude_frozen_parameters)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
|
| 216 |
+
if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
|
| 217 |
+
return
|
| 218 |
+
|
| 219 |
+
frozen_param_shapes = zero_model_states[0].frozen_param_shapes
|
| 220 |
+
frozen_param_fragments = zero_model_states[0].frozen_param_fragments
|
| 221 |
+
|
| 222 |
+
if debug:
|
| 223 |
+
num_elem = sum(s.numel() for s in frozen_param_shapes.values())
|
| 224 |
+
print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
|
| 225 |
+
|
| 226 |
+
wanted_params = len(frozen_param_shapes)
|
| 227 |
+
wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
|
| 228 |
+
avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
|
| 229 |
+
print(f'Frozen params: Have {avail_numel} numels to process.')
|
| 230 |
+
print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
|
| 231 |
+
|
| 232 |
+
total_params = 0
|
| 233 |
+
total_numel = 0
|
| 234 |
+
for name, shape in frozen_param_shapes.items():
|
| 235 |
+
total_params += 1
|
| 236 |
+
unpartitioned_numel = shape.numel()
|
| 237 |
+
total_numel += unpartitioned_numel
|
| 238 |
+
|
| 239 |
+
state_dict[name] = frozen_param_fragments[name]
|
| 240 |
+
|
| 241 |
+
if debug:
|
| 242 |
+
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
|
| 243 |
+
|
| 244 |
+
print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _has_callable(obj, fn):
|
| 248 |
+
attr = getattr(obj, fn, None)
|
| 249 |
+
return callable(attr)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
|
| 253 |
+
param_shapes = zero_model_states[0].param_shapes
|
| 254 |
+
|
| 255 |
+
# Reconstruction protocol:
|
| 256 |
+
#
|
| 257 |
+
# XXX: document this
|
| 258 |
+
|
| 259 |
+
if debug:
|
| 260 |
+
for i in range(world_size):
|
| 261 |
+
for j in range(len(fp32_flat_groups[0])):
|
| 262 |
+
print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
|
| 263 |
+
|
| 264 |
+
# XXX: memory usage doubles here (zero2)
|
| 265 |
+
num_param_groups = len(fp32_flat_groups[0])
|
| 266 |
+
merged_single_partition_of_fp32_groups = []
|
| 267 |
+
for i in range(num_param_groups):
|
| 268 |
+
merged_partitions = [sd[i] for sd in fp32_flat_groups]
|
| 269 |
+
full_single_fp32_vector = torch.cat(merged_partitions, 0)
|
| 270 |
+
merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
|
| 271 |
+
avail_numel = sum(
|
| 272 |
+
[full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
|
| 273 |
+
|
| 274 |
+
if debug:
|
| 275 |
+
wanted_params = sum([len(shapes) for shapes in param_shapes])
|
| 276 |
+
wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
|
| 277 |
+
# not asserting if there is a mismatch due to possible padding
|
| 278 |
+
print(f"Have {avail_numel} numels to process.")
|
| 279 |
+
print(f"Need {wanted_numel} numels in {wanted_params} params.")
|
| 280 |
+
|
| 281 |
+
# params
|
| 282 |
+
# XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
|
| 283 |
+
# out-of-core computing solution
|
| 284 |
+
total_numel = 0
|
| 285 |
+
total_params = 0
|
| 286 |
+
for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
|
| 287 |
+
offset = 0
|
| 288 |
+
avail_numel = full_single_fp32_vector.numel()
|
| 289 |
+
for name, shape in shapes.items():
|
| 290 |
+
|
| 291 |
+
unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
|
| 292 |
+
total_numel += unpartitioned_numel
|
| 293 |
+
total_params += 1
|
| 294 |
+
|
| 295 |
+
if debug:
|
| 296 |
+
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
|
| 297 |
+
state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
|
| 298 |
+
offset += unpartitioned_numel
|
| 299 |
+
|
| 300 |
+
# Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
|
| 301 |
+
# avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
|
| 302 |
+
# paddings performed in the code it's almost impossible to predict the exact numbers w/o the
|
| 303 |
+
# live optimizer object, so we are checking that the numbers are within the right range
|
| 304 |
+
align_to = 2 * world_size
|
| 305 |
+
|
| 306 |
+
def zero2_align(x):
|
| 307 |
+
return align_to * math.ceil(x / align_to)
|
| 308 |
+
|
| 309 |
+
if debug:
|
| 310 |
+
print(f"original offset={offset}, avail_numel={avail_numel}")
|
| 311 |
+
|
| 312 |
+
offset = zero2_align(offset)
|
| 313 |
+
avail_numel = zero2_align(avail_numel)
|
| 314 |
+
|
| 315 |
+
if debug:
|
| 316 |
+
print(f"aligned offset={offset}, avail_numel={avail_numel}")
|
| 317 |
+
|
| 318 |
+
# Sanity check
|
| 319 |
+
if offset != avail_numel:
|
| 320 |
+
raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
|
| 321 |
+
|
| 322 |
+
print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
|
| 326 |
+
exclude_frozen_parameters):
|
| 327 |
+
state_dict = OrderedDict()
|
| 328 |
+
|
| 329 |
+
# buffers
|
| 330 |
+
buffers = zero_model_states[0].buffers
|
| 331 |
+
state_dict.update(buffers)
|
| 332 |
+
if debug:
|
| 333 |
+
print(f"added {len(buffers)} buffers")
|
| 334 |
+
|
| 335 |
+
if not exclude_frozen_parameters:
|
| 336 |
+
_zero2_merge_frozen_params(state_dict, zero_model_states)
|
| 337 |
+
|
| 338 |
+
_zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
|
| 339 |
+
|
| 340 |
+
# recover shared parameters
|
| 341 |
+
for pair in zero_model_states[0].shared_params:
|
| 342 |
+
if pair[1] in state_dict:
|
| 343 |
+
state_dict[pair[0]] = state_dict[pair[1]]
|
| 344 |
+
|
| 345 |
+
return state_dict
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
|
| 349 |
+
remainder = unpartitioned_numel % world_size
|
| 350 |
+
padding_numel = (world_size - remainder) if remainder else 0
|
| 351 |
+
partitioned_numel = math.ceil(unpartitioned_numel / world_size)
|
| 352 |
+
return partitioned_numel, padding_numel
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
|
| 356 |
+
if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
|
| 357 |
+
return
|
| 358 |
+
|
| 359 |
+
if debug:
|
| 360 |
+
for i in range(world_size):
|
| 361 |
+
num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
|
| 362 |
+
print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
|
| 363 |
+
|
| 364 |
+
frozen_param_shapes = zero_model_states[0].frozen_param_shapes
|
| 365 |
+
wanted_params = len(frozen_param_shapes)
|
| 366 |
+
wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
|
| 367 |
+
avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
|
| 368 |
+
print(f'Frozen params: Have {avail_numel} numels to process.')
|
| 369 |
+
print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
|
| 370 |
+
|
| 371 |
+
total_params = 0
|
| 372 |
+
total_numel = 0
|
| 373 |
+
for name, shape in zero_model_states[0].frozen_param_shapes.items():
|
| 374 |
+
total_params += 1
|
| 375 |
+
unpartitioned_numel = shape.numel()
|
| 376 |
+
total_numel += unpartitioned_numel
|
| 377 |
+
|
| 378 |
+
param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
|
| 379 |
+
state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
|
| 380 |
+
|
| 381 |
+
partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
|
| 382 |
+
|
| 383 |
+
if debug:
|
| 384 |
+
print(
|
| 385 |
+
f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
class GatheredTensor:
|
| 392 |
+
"""
|
| 393 |
+
A pseudo tensor that collects partitioned weights.
|
| 394 |
+
It is more memory efficient when there are multiple groups.
|
| 395 |
+
"""
|
| 396 |
+
|
| 397 |
+
def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
|
| 398 |
+
self.flat_groups = flat_groups
|
| 399 |
+
self.flat_groups_offset = flat_groups_offset
|
| 400 |
+
self.offset = offset
|
| 401 |
+
self.partitioned_numel = partitioned_numel
|
| 402 |
+
self.shape = shape
|
| 403 |
+
self.dtype = self.flat_groups[0][0].dtype
|
| 404 |
+
|
| 405 |
+
def contiguous(self):
|
| 406 |
+
"""
|
| 407 |
+
Merge partitioned weights from flat_groups into a single tensor.
|
| 408 |
+
"""
|
| 409 |
+
end_idx = self.offset + self.partitioned_numel
|
| 410 |
+
world_size = len(self.flat_groups)
|
| 411 |
+
pad_flat_param_chunks = []
|
| 412 |
+
|
| 413 |
+
for rank_i in range(world_size):
|
| 414 |
+
# for each rank, we need to collect weights from related group/groups
|
| 415 |
+
flat_groups_at_rank_i = self.flat_groups[rank_i]
|
| 416 |
+
start_group_id = None
|
| 417 |
+
end_group_id = None
|
| 418 |
+
for group_id in range(len(self.flat_groups_offset)):
|
| 419 |
+
if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
|
| 420 |
+
start_group_id = group_id
|
| 421 |
+
if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
|
| 422 |
+
end_group_id = group_id
|
| 423 |
+
break
|
| 424 |
+
# collect weights from related group/groups
|
| 425 |
+
for group_id in range(start_group_id, end_group_id + 1):
|
| 426 |
+
flat_tensor = flat_groups_at_rank_i[group_id]
|
| 427 |
+
start_offset = self.offset - self.flat_groups_offset[group_id]
|
| 428 |
+
end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
|
| 429 |
+
pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
|
| 430 |
+
|
| 431 |
+
# collect weights from all ranks
|
| 432 |
+
pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
|
| 433 |
+
param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
|
| 434 |
+
return param
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Populate ``state_dict`` with the trainable fp32 parameters of a ZeRO-3
    checkpoint, as lazy ``GatheredTensor`` handles (no weights materialized here).

    Args:
        - ``state_dict``: dict to populate (param name -> GatheredTensor)
        - ``world_size``: number of ranks the checkpoint was saved with
        - ``fp32_flat_groups``: per-rank lists of flat fp32 partition tensors
        - ``zero_model_states``: per-rank model states; rank 0 supplies param shapes

    Raises:
        ValueError: if the total numel consumed doesn't match the numel available.
    """
    param_shapes = zero_model_states[0].param_shapes
    # Each rank holds a *list* of flat groups; total availability spans all ranks.
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            # fp32_flat_groups[i] is a list of tensors, so report each sub-group's shape
            # (calling .shape on the list itself would raise AttributeError)
            print(f"{FP32_FLAT_GROUPS}[{i}].shapes={[flat_group.shape for flat_group in fp32_flat_groups[i]]}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    # NOTE: avail_numel was already computed above from the list of flat groups;
    # re-deriving it via fp32_flat_groups[0].numel() would fail since that is a list.
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    # Cumulative numel boundaries of rank 0's flat groups, starting at 0.
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset tracked a single rank's partition; scale to the global total.
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated fp32 state_dict for a ZeRO-3 checkpoint:
    buffers first, then (optionally) frozen params, then trainable params,
    and finally re-link any shared parameters.
    """
    state_dict = OrderedDict()

    # buffers come straight from rank 0's model state
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # frozen params are skipped entirely when the caller asked to exclude them
    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: alias each shared name to its source entry
    for shared_name, source_name in zero_model_states[0].shared_params:
        if source_name in state_dict:
            state_dict[shared_name] = state_dict[source_name]

    return state_dict
| 511 |
+
|
| 512 |
+
|
| 513 |
+
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor

    When ``return_empty_tensor`` is True, only shape/dtype placeholders are
    allocated (no weights are materialized). Tensors that appear under several
    names (shared params) are converted once and aliased, preserving sharing.
    """
    result = {}
    first_seen = {}  # id(tensor) -> name under which it was first converted
    for name, tensor in state_dict.items():
        owner = first_seen.get(id(tensor))
        if owner is not None:
            # shared tensor: reuse the object converted for its first name
            result[name] = result[owner]
            continue
        first_seen[id(tensor)] = name
        if return_empty_tensor:
            result[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
        else:
            result[name] = tensor.contiguous()
    return result
| 531 |
+
|
| 532 |
+
|
| 533 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient.
          Convert the pseudo tensor to torch tensor by ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous() # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    # Resolve the tag from the 'latest' marker file when not given explicitly.
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        # entries are lazy pseudo tensors; caller materializes via .contiguous()
        return state_dict
    else:
        return to_torch_tensor(state_dict)
| 596 |
+
|
| 597 |
+
|
| 598 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # Dependency pre-check: fail early, with a helpful hint, before any expensive work
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict
    # lazy_mode=True keeps entries as pseudo tensors so that only one shard's
    # worth of weights needs to be materialized in memory at a time (see below)
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        # a memory-efficient approach for sharding: plan the shards from empty
        # tensors (shape/dtype only) without materializing any weights
        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        from collections import namedtuple
        # minimal stand-in mirroring the fields used from huggingface_hub's split result
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
        # materialize only this shard's tensors as real torch tensors
        shard_state_dict = to_torch_tensor(shard_state_dict)
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard_state_dict, output_path)
        # release the memory of current shard
        for tensor_name in list(shard_state_dict.keys()):
            del state_dict[tensor_name]
            del shard_state_dict[tensor_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
| 681 |
+
|
| 682 |
+
|
| 683 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    # NOTE(review): strict=False tolerates keys missing from the checkpoint —
    # presumably for excluded frozen params; confirm this is the intended reason
    model.load_state_dict(state_dict, strict=False)

    return model
| 720 |
+
|
| 721 |
+
|
| 722 |
+
if __name__ == "__main__":
    # CLI entry point: consolidate a ZeRO checkpoint into fp32 weight files.
    cli = argparse.ArgumentParser()
    # positional arguments
    cli.add_argument("checkpoint_dir",
                     type=str,
                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    cli.add_argument("output_dir",
                     type=str,
                     help="directory to the pytorch fp32 state_dict output files"
                     "(e.g. path/checkpoint-12-output/)")
    # optional arguments
    cli.add_argument("--max_shard_size",
                     type=str,
                     default="5GB",
                     help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
                     "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
                     "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
                     "without CPU OOM issues.")
    cli.add_argument("--safe_serialization",
                     default=False,
                     action='store_true',
                     help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    cli.add_argument("-t", "--tag",
                     type=str,
                     default=None,
                     help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    cli.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
    cli_args = cli.parse_args()

    # module-level flag consulted by the helper functions above
    debug = cli_args.debug

    convert_zero_checkpoint_to_fp32_state_dict(cli_args.checkpoint_dir,
                                               cli_args.output_dir,
                                               max_shard_size=cli_args.max_shard_size,
                                               safe_serialization=cli_args.safe_serialization,
                                               tag=cli_args.tag,
                                               exclude_frozen_parameters=cli_args.exclude_frozen_parameters)
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_loss.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_runtime.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_samples_per_second.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_steps_per_second.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/eval_token_acc.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_epoch.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_grad_norm.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_learning_rate.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_loss.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_token_acc.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_total_flos.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_loss.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_runtime.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_samples_per_second.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/images/train_train_steps_per_second.png
ADDED
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/logging.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/runs/events.out.tfevents.1756662002.TENCENT64.site.163428.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:981225762ce4dd9ad3552b1e2e355ccb3437cf2aafb0d4057d223677725048e8
|
| 3 |
+
size 174546
|
qwen2.5-7b-base-1500q-15001-old-click-4ep-lr6e-5/val_dataset.jsonl
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which of the following is known for the song Sister Havana: Urge Overkill or Ministry?\nObservation: [1947] RootWebArea 'Search: Sister Havana Urge Overkill' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Sister+Havana+Urge+Overkill\n\t[1968] textbox \"Search 'Wikipedia'\" required: False\n\t[1974] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1975] button '🏠'\n\t[1976] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1977] button 'Wikipedia'\n\t[1978] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1979] button '🎲'\n\t[1839] StaticText 'Results 1-20 of 20 for \"Sister Havana Urge Overkill\"'\n\t[1987] link 'Saturation (Urge Overkill album)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Saturation_(Urge_Overkill_album)\n\t[1846] StaticText '...\"Sister Havana\" and \"Positive Bleeding\" as singles in the US and Europe. \"Sister Havana\" charted highly on both the modern rock and mainstream rock charts, peaking at numbers 6 and 10, respectively, while \"Positive Bleeding\" became a minor rock radio hit. Saturation Studio album by Urge Overkill Released June 8, 1993 Recorded December 17, 1992–January 9, 1993 Genre Alternative rock Length 70:01 Label Geffen Producer Butcher Bros., Andy Kravitz Urge Overkill chronology Stull (1992) Saturation......'\n\t[1855] StaticText 'from Wikipedia'\n\t[1856] StaticText '635 words'\n\t[1996] link 'Urge Overkill' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Urge_Overkill\n\t[1858] StaticText '...\"Sister Havana\" and their cover of Neil Diamond\\'s \"Girl, You\\'ll Be a Woman Soon\", which was notably used in Quentin Tarantino\\'s Pulp Fiction. Their latest album, Oui, was released in 2022. 
Urge Overkill Background information Origin Chicago, Illinois, United States Genres Alternative rock, hard rock Years active 1986–1997, 2004–present Labels Touch and Go, Geffen Website www.urgeoverkill.com Members Nash Kato Eddie \"King\" Roeser Nate Arling Adam Arling Past members Johnny \"Blackie Onassis\"......'\n\t[1863] StaticText 'from Wikipedia'\n\t[1864] StaticText '1,430 words'\n\t[2003] link 'Triple J Hottest 100, 1993' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Triple_J_Hottest_100,_1993\n\t[1872] StaticText '1,786 words'\n\t[2010] link 'Hit Machine' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Hit_Machine\n\t[1874] StaticText '...\"Sister Havana\" (3:43) Peter Andre – \"Let\\'s Get It On\" (4:09) Leftfield featuring John Lydon – \"Open Up\" (3:46) Xscape – \"Just Kickin\\' It\" (3:25) Ace of Base – \"Happy Nation\" (3:28) Defryme – \"Pure Killer\" (4:04) Funky Poets – \"Born in the Ghetto\" (4:18) Certifications Region Certification Certified units/sales Australia (ARIA)[7] Platinum 70,000^ ^ Shipments figures based on certification alone. Hit Machine 5 Hit Machine 5 Compilation album by Various artists Released 1994 Genre......'\n\t[1879] StaticText 'from Wikipedia'\n\t[1880] StaticText '5,688 words'\n\t[2017] link 'Reading and Leeds Festivals line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Reading_and_Leeds_Festivals_line-ups\n\t[1886] StaticText '16,832 words'\n\t[2023] link 'Glastonbury Festival line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Glastonbury_Festival_line-ups\nHISTORY_ACTION: type [328] [Sister Havana] [1]\ntype [1335] [Sister Havana Urge Overkill] [1]\n"}, {"role": "assistant", "content": "<think>The search results and snippets indicate that \"Sister Havana\" is a song by Urge Overkill. The text mentions \"Sister Havana\" as a single by Urge Overkill, and there is no mention of Ministry in connection with the song. 
I have enough information to answer the question.</think>\n```stop [Urge Overkill is known for the song \"Sister Havana\".]```"}]}
|
| 2 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: what class of food does egg belong to\nObservation: [1481] RootWebArea 'Search: egg' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=egg\n\t[1502] textbox \"Search 'Wikipedia'\" required: False\n\t[1508] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1509] button '🏠'\n\t[1510] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1511] button 'Wikipedia'\n\t[1512] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1513] button '🎲'\n\t[1033] StaticText 'Results 1-25 of 
78,533 for \"egg\"'\n\t[1521] link 'List of egg topics' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_topics\n\t[1040] StaticText '...Egg dishes and List of egg dishes A to Z egg dishes Balut (egg) Bai pong moan Boiled egg Brik Century egg Changua Chinese steamed eggs Çılbır Coddled egg Custard Deviled egg Devonshire eggs Egg bhurji Egg drop soup Egg foo young Egg in the basket Egg khagina Egg salad Egg sandwich Eggs Benedict Eggs Neptune Eggs Sardou Eggs and brains French toast Fried egg Fritaja Frittata Haminados Hangtown fry Huevos divorciados[lower-alpha 1] Huevos motuleños[lower-alpha 1] Huevos rancheros[lower-alpha 1]......'\n\t[1073] StaticText 'from Wikipedia'\n\t[1074] StaticText '907 words'\n\t[1542] link 'Boiled egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Boiled_egg\n\t[1076] StaticText '...egg white and egg yolk both solidify, while soft-boiled eggs may leave the yolk, and sometimes the white, at least partially liquid and raw. Boiled eggs are a popular breakfast food around the world. For eggs cooked without their shells in hot water, see poached egg. For other egg dishes, see list of egg dishes. Boiled egg A soft-boiled egg served in the half shell Main ingredients Eggs (typically chicken) Variations Baked eggs, starting temperature, preparation Food energy (per 100\\xa0g serving)......'\n\t[1091] StaticText 'from Wikipedia'\n\t[1092] StaticText '2,185 words'\n\t[1554] link 'Egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg\n\t[1094] StaticText 'Egg An egg is the organic vessel containing the zygote in which an embryo develops until it can survive on its own, at which point the animal hatches. An egg results from fertilization of an egg cell. Most arthropods, vertebrates (excluding live-bearing mammals), and mollusks lay eggs, although some, such as scorpions, do not. This article is about biological eggs. For eggs as food, see Egg as food. For other uses, see Egg (disambiguation). 
Eggs of various birds, a reptile, various cartilaginous......'\n\t[1106] StaticText 'from Wikipedia'\n\t[1107] StaticText '5,196 words'\n\t[1565] link 'Easter egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Easter_egg\n\t[1109] StaticText '...egg (media). Decorated Easter eggs Easter egg of the Ukrainian variety with the Paschal greeting \"Christ is Risen!\" A chocolate Easter egg Although eggs, in general, were a traditional symbol of fertility and rebirth,[2] in Christianity, for the celebration of Eastertide, Easter eggs symbolize the empty tomb of Jesus, from which Jesus was resurrected.[3][4][5] In addition, one ancient tradition was the staining of Easter eggs with the colour red \"in memory of the blood of Christ, shed as at that......'\n\t[1116] StaticText 'from Wikipedia'\n\t[1117] StaticText '6,153 words'\n\t[1573] link 'Egg as food' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg_as_food\n\t[1119] StaticText '...(egg white), and vitellus (egg yolk), contained within various thin membranes. The most commonly consumed eggs are chicken eggs. Other poultry eggs including those of duck and quail also are eaten. Fish eggs are called roe and caviar. \"Chicken egg\" redirects here. For the causality dilemma, see Chicken or the egg. For Egg (biology), see Egg. For other uses, see Egg (disambiguation). Fried eggs with carrot, Parmesan and cream Egg yolks and whole eggs store significant amounts of protein and......'\n\t[1136] StaticText 'from Wikipedia'\n\t[1137] StaticText '9,695 words'\n\t[1586] link 'List of egg dishes' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_dishes\n\t[1139] StaticText '...(egg yolk), contained within many different thin membranes all surrounded by a protective eggshell. 
Popular choices for egg consumption are chicken, duck, quail, roe, caviar, and emu, but the '\nHISTORY_ACTION: type [330] [egg] [1]\n"}, {"role": "assistant", "content": "<think>The link titled 'Egg as food' is likely to contain information about the class of food to which eggs belong. Clicking this link should provide the relevant details.</think>\n```click [1573] [Egg as food]```"}]}
|
| 3 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Lawrence D. 
Cohen's Carrie:The Musical is an adaptation of Stephen King's novel, which centers around a socially awkward teenage girl with what newly discovered powers?\nObservation: [2717] RootWebArea 'Carrie (musical)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)\n\t[2829] textbox \"Search 'Wikipedia'\" required: False\n\t[2835] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[2836] button '🏠'\n\t[2837] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[2838] button 'Wikipedia'\n\t[2839] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[2840] button '🎲'\n\t[2847] heading 'Carrie (musical)' hasPopup: menu\n\t[1789] StaticText 'Carrie is a '\n\t[2719] link 'musical' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Musical_theatre\n\t[1792] StaticText ' with a book by '\n\t[2854] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t[1794] StaticText ', lyrics by '\n\t[2855] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t[1796] StaticText ', and music by '\n\t[2856] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t[1798] StaticText '. Adapted from '\n\t[2857] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t[2859] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1802] StaticText ', it focuses on an awkward teenage girl with '\n\t[2860] link 'telekinetic powers' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Psychokinesis\n\t[1804] StaticText ' whose lonely life is dominated by an oppressive '\n\t[2861] link 'religious fanatic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Religious_fanaticism\n\t[1806] StaticText ' mother. 
When she is humiliated by her classmates at the high school '\n\t[2862] link 'prom' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Prom\n\t[1808] StaticText ', she unleashes chaos on everyone and everything in her path.'\n\t[2864] table ''\n\t\t[2866] row ''\n\t\t\t[2867] columnheader 'Carrie' required: False\n\t\t[2870] row ''\n\t\t\t[2871] cell 'Official poster for the original Stratford-upon-Avon production'\n\t\t\t\t[2768] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Carrie_the_Musical_Poster.jpg.webp\n\t\t[2874] row ''\n\t\t\t[2875] rowheader 'Music' required: False\n\t\t\t[2876] cell 'Michael Gore'\n\t\t\t\t[2877] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[2878] row ''\n\t\t\t[2879] rowheader 'Lyrics' required: False\n\t\t\t[2880] cell 'Dean Pitchford'\n\t\t\t\t[2881] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[2882] row ''\n\t\t\t[2883] rowheader 'Book' required: False\n\t\t\t\t[2884] link 'Book' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Book_(musical_theatre)\n\t\t\t[2885] cell 'Lawrence D. Cohen'\n\t\t\t\t[2886] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[2887] row ''\n\t\t\t[2888] rowheader 'Basis' required: False\n\t\t\t[2889] cell 'Carrie by Stephen King'\n\t\t\t\t[2890] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1830] StaticText 'Originally premiering in the U.K. in 1988, Carrie opened on '\n\t[2899] link 'Broadway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1834] StaticText ' at the Virginia Theatre the same year, but closed after 16 previews and five regular performances. 
Due to the passionate response from both its critics and its fans, the show is considered one of the most notable failures in '\n\t[2900] link 'Broadway theatre' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1836] StaticText ' history; a 1991 book written by '\n\t[2901] link 'Ken Mandelbaum' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ken_Mandelbaum\n\t[1838] StaticText ', which chronicled the history of flop Broadway musicals, was partially entitled Not Since Carrie, and a 2021 podcast, Out for Blood, documented its creation and development.'\n\t[2904] group ''\n\t\t[2909] DisclosureTriangle 'Production history' expanded: True\n\t\t\t[2910] heading 'Production history'\n\t\t[1845] StaticText 'Inspired by a 1981 performance of '\n\t\t[2912] link 'Alban Berg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alban_Berg\n\t\t[2914] link 'Lulu' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lulu_(opera)\n\t\t[1849] StaticText ' at the '\n\t\t[2915] link 'Metropolitan Opera House' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Metropolitan_Opera_House_(Lincoln_Center)\n\t\t[2917] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-DVD_featurette-1\n\t\t[2919] link 'Lawrence D. 
Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[1854] StaticText ', who wrote the script for the 1976 film version of '\n\t\t[2920] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(1976_film)\n\t\t[1856] StaticText ', and '\n\t\t[2922] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[1858] StaticText ' began work on a musical based on the '\n\t\t[2923] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t\t[2925] link 'Fame' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fame_(1980_film)\n\t\t[1862] StaticText ' collaborator, '\n\t\t[2926] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[1864] StaticText ', was brought in to work on the project, which underwent numerous rewrites.'\n\t\t[2928] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-2\n\t\t[1865] StaticText ' In August 1984, a '\n\t\t[2930] link 'workshop' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Workshop\n\t\t[1867] StaticText ' of the first act was staged at 890 Broadway in New York City, with '\n\t\t[2931] link 'Annie Golden' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Annie_Golden\n\t\t[1869] StaticText ' as Carrie, '\n\t\t[2932] link 'Maureen McGovern' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Maureen_McGovern\n\t\t[1871] StaticText ' as Mrs. White, '\n\t\t[2933] link 'Laurie Beechman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Laurie_Beechman\n\t\t[1873] StaticText ' as Mrs. Gardner, and '\n\t\t[2934] link 'Liz Callaway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Liz_Callaway\n\t\t[1875] StaticText ' as Chris. 
It was soon announced that Carrie would be produced on '\nHISTORY_ACTION: type [327] [Carrie (musical)] [1]\nclick [1309] [Carrie (musical)]\n"}, {"role": "assistant", "content": "<think>The objective is to identify what newly discovered powers the main character, Carrie, has in Lawrence D. Cohen's adaptation. The text in the accessibility tree states: \"it focuses on an awkward teenage girl with telekinetic powers whose lonely life is dominated by an oppressive religious fanatic mother.\" This directly answers the question.</think>\n```stop [telekinetic powers]```"}]}
|
| 4 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Name the second world war trilogy written by the English writer, Evelyn Waugh, who is also known for his travel writings in the 1930s and a later travel book called 'A Tourist in Africa' which appeared in 1960?\nObservation: [292] RootWebArea 'User:The other Kiwix guy/Landing' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing\n\t[329] textbox \"Search 'Wikipedia'\" required: False\n\t[335] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[336] button '🏠'\n\t[337] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[338] button 
'Wikipedia'\n\t[339] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[340] button '🎲'\n\t[3] StaticText 'Welcome to '\n\t[351] link 'Wikipedia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wikipedia\n\t[5] StaticText 'The free encyclopedia.'\n\t[6] StaticText '6,489,052'\n\t[7] StaticText ' articles in '\n\t[357] link 'English' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/English_Wikipedia\n\t[359] heading 'Arts'\n\t[361] link 'Architecture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Architecture\n\t[362] link 'Books' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Books\n\t[363] link 'Cinematography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cinematography\n\t[364] link 'Dance' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dance\n\t[365] link 'Design' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Design\n\t[366] link 'Fashion' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fashion\n\t[367] link 'Films' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Films\n\t[368] link 'Gastronomy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gastronomy\n\t[369] link 'Literature' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Literature\n\t[370] link 'Magic (illusion)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Magic_(illusion)\n\t[371] link 'Music' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Music\n\t[372] link 'Painting' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Painting\n\t[373] link 'Photography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Photography\n\t[374] link 'Poetry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Poetry\n\t[375] link 'Sculpture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sculpture\n\t[376] link 'Theatre' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Theatre\n\t[378] heading 'Geography'\n\t[380] link 'Africa' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Africa\n\t[381] link 'Antarctica' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Antarctica\n\t[382] link 'Arctic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Arctic\n\t[383] link 'Asia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Asia\n\t[384] link 'Caribbean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Caribbean\n\t[385] link 'Central America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Central_America\n\t[386] link 'Europe' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Europe\n\t[387] link 'Latin America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Latin_America\n\t[388] link 'Mediterranean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mediterranean\n\t[389] link 'Middle East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_East\n\t[390] link 'North America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/North_America\n\t[391] link 'Oceania' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Oceania\n\t[392] link 'South America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/South_America\n\t[393] link 'Cartography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cartography\n\t[395] heading 'History'\n\t[397] link 'Ancient Egypt' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Egypt\n\t[398] link 'Ancient Greece' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Greece\n\t[399] link 'Ancient Near East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Near_East\n\t[400] link 'Ancient Rome' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Rome\n\t[401] link 'Archaeology' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Archaeology\n\t[402] link 'British Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/British_Empire\n\t[294] link 'Byzantine Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Byzantine_Empire\n\t[403] link 'Colonialism' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Colonialism\n\t[404] link 'Crusades' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Crusades\n\t[405] link 'Heraldry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heraldry\n\t[406] link 'History of science' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_science\n\t[407] link 'Imperial China' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_China#Imperial_China\n\t[408] link 'Indian independence movement' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Indian_independence_movement\n\t[409] link 'Japan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_Japan\n\t[410] link 'Middle Ages' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_Ages\n\t[411] link 'Mughal Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mughal_Empire\n\t[412] link 'Ottoman Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ottoman_Empire\n\t[413] link 'Russian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Russian_Empire\n\t[414] link 'Sasanian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sasanian_Empire\n\t[415] link 'Seljuk Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Seljuk_Empire\n\t[416] link 'Soviet Union' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Soviet_Union\n\t[417] link 'War' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/War\n\t[419] heading 'Sciences'\nHISTORY_ACTION: \n\n"}, {"role": "assistant", "content": "<think>The objective is to find the name of the Second 
World War trilogy written by Evelyn Waugh. The current page has a search textbox with id [329] labeled \"Search 'Wikipedia'\". The most efficient way to find the answer is to search for \"Evelyn Waugh\" in Wikipedia, as his article should mention his major works, including the WWII trilogy. I will type \"Evelyn Waugh\" into the search box and press enter.</think>\n```type [329] [Evelyn Waugh] [1]```"}]}
|
| 5 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which Wisconsin Badgers player won a Heismann trophy and was elected to the Pro Bowl in each of their first four seasons?\nObservation: [13584] RootWebArea 'Alan Ameche' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche\n\t[13622] textbox \"Search 'Wikipedia'\" required: False\n\t[13628] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[13629] button '🏠'\n\t[13630] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[13631] button 'Wikipedia'\n\t[13632] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[13633] button '🎲'\n\t[13640] heading 'Alan Ameche' hasPopup: menu\n\t[13027] StaticText 'Alan Ameche (/əmiːtʃi/; June 1, 1933 – August 8, 1988), nicknamed \"The Iron Horse\", or simply \"The Horse\", was an '\n\t[13655] link 'American football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/American_football\n\t[13041] StaticText ' player who played six seasons with the '\n\t[13656] link 'Baltimore Colts' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_the_Baltimore_Colts\n\t[13043] StaticText ' in the '\n\t[13657] link 'National Football League' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/National_Football_League\n\t[13045] StaticText ' (NFL). He played '\n\t[13658] link 'college football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/College_football\n\t[13047] StaticText ' at the '\n\t[13659] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t[13049] StaticText ' and won the '\n\t[13660] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\n\t[13051] StaticText ' during his senior season in '\n\t[13661] link '1954' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_Wisconsin_Badgers_football_team\n\t[13053] StaticText '. He was elected to the '\n\t[13662] link 'Pro Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pro_Bowl\n\t[13055] StaticText ' in each of his first four seasons in the league. 
He is famous for scoring the winning touchdown in overtime in the '\n\t[13663] link '1958 NFL Championship Game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_NFL_Championship_Game\n\t[13057] StaticText ' against the '\n\t[13664] link 'New York Giants' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_New_York_Giants_season\n\t[13059] StaticText ', labeled \"The Greatest Game Ever Played.\"'\n\t[13666] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-mscwiot-1\n\t[13669] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-everplyd-2\n\t[13673] table 'Alan Ameche'\n\t\t[13674] caption ''\n\t\t[13676] row ''\n\t\t\t[13677] cell ''\n\t\t\t\t[13585] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Alan_Ameche_1960.jpg.webp\n\t\t[13679] row ''\n\t\t\t[13680] columnheader 'No. 35' required: False\n\t\t[13681] row ''\n\t\t\t[13682] rowheader 'Position:' required: False\n\t\t\t[13683] cell 'Fullback'\n\t\t\t\t[13684] link 'Fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13685] row ''\n\t\t\t[13686] columnheader 'Personal information' required: False\n\t\t[13687] row ''\n\t\t\t[13688] rowheader 'Born:' required: False\n\t\t\t[13689] cell 'June 1, 1933 Kenosha, Wisconsin, U.S.'\n\t\t\t\t[13692] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13693] row ''\n\t\t\t[13694] rowheader 'Died:' required: False\n\t\t\t[13695] cell 'August 8, 1988 (aged\\xa055) Houston, Texas, U.S.'\n\t[13189] StaticText 'With colleague and former Colts teammate '\n\t[13812] link 'Gino Marchetti' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino_Marchetti\n\t[13191] StaticText ', Ameche founded the '\n\t[13587] link \"Gino's Hamburgers\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino's_Hamburgers\n\t[13193] 
StaticText ' chain.'\n\t[13814] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-aaatyp-3\n\t[13816] group ''\n\t\t[13821] DisclosureTriangle 'Early life' expanded: True\n\t\t\t[13822] heading 'Early life'\n\t\t[13197] StaticText 'Born in '\n\t\t[13824] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13199] StaticText ', as Lino Dante Amici to Italian immigrant parents who came to the United States in the late 1920s, although they returned for a year to '\n\t\t[13826] link 'Italy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Italy\n\t\t[13203] StaticText ' during his childhood. The family then returned to Kenosha, where he attended '\n\t\t[13827] link 'Bradford High School' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mary_D._Bradford_High_School\n\t\t[13205] StaticText '. Ameche was a cousin of actor brothers '\n\t\t[13828] link 'Don' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Don_Ameche\n\t\t[13207] StaticText ' and '\n\t\t[13829] link 'Jim Ameche' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Jim_Ameche\n\t\t[13831] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-4\n\t[13833] group ''\n\t\t[13838] DisclosureTriangle 'College career' expanded: True\n\t\t\t[13839] heading 'College career'\n\t\t[13212] StaticText 'Ameche earned consensus '\n\t\t[13841] link 'All-America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_College_Football_All-America_Team\n\t\t[13214] StaticText ' honors at the '\n\t\t[13842] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/University_of_Wisconsin%E2%80%93Madison\n\t\t[13216] StaticText ', where he played '\n\t\t[13843] link 'linebacker' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Linebacker\n\t\t[13218] StaticText ' as well as '\n\t\t[13844] 
link 'fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13845] link 'single-platoon' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/One-platoon_system\n\t\t[13222] StaticText ' days. In four years as a '\n\t\t[13846] link 'Badger' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t\t[13847] link 'bowl game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Bowl_game\n\t\t[13226] StaticText ', the '\n\t\t[13848] link '1953 Rose Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1953_Rose_Bowl\n\t\t[13228] StaticText ', as a sophomore, rushing for 133 yards on 28 carries. Ameche won the '\n\t\t[13849] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\nHISTORY_ACTION: type [328] [Wisconsin Badgers Heisman Trophy winners] [1]\nclick [1391] [Wisconsin Badgers football]\nclick [4874] [Alan Ameche]\n"}, {"role": "assistant", "content": "<think>The page provides detailed information about Alan Ameche, including that he won the Heisman Trophy while at the University of Wisconsin–Madison and was elected to the Pro Bowl in each of his first four NFL seasons. This matches the user's query exactly.</think>\n```stop [Alan Ameche is the Wisconsin Badgers player who won a Heisman Trophy and was elected to the Pro Bowl in each of his first four seasons.]```"}]}
|
| 6 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who is the main character in that 70s show\nObservation: [1388] RootWebArea \"That '70s Show\" focused: True\n\t[1913] textbox \"Search 'Wikipedia'\" required: False\n\t[1915] link 'Go to welcome page'\n\t\t[1921] button '🏠'\n\t[1916] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1923] button 'Wikipedia'\n\t[1917] link 'Go to a randomly selected page'\n\t\t[1925] button '🎲'\n\t[1927] heading \"That '70s Show\" hasPopup: menu\n\t[1944] link 'period'\n\t[1946] link 'sitcom'\n\t[1947] StaticText ' that aired on '\n\t[1948] link 'Fox'\n\t[1949] StaticText ' from August 23, 1998, to May 18, 2006. 
The series focuses on the lives of a group of six teenage friends living in fictional Point Place, Wisconsin, from May 17, 1976, to December 31, 1979.'\n\t[1408] link '[]'\n\t[1599] table ''\n\t\t[2729] row ''\n\t\t\t[2748] columnheader \"That '70s Show\" required: False\n\t\t[2730] row ''\n\t\t\t[2749] gridcell '' required: False\n\t\t[1601] row ''\n\t\t\t[2750] rowheader 'Genre' required: False\n\t\t\t[1602] gridcell 'Period sitcom Teen sitcom' required: False\n\t\t\t\t[1606] ListMarker '■ '\n\t\t\t\t[3222] link 'Period'\n\t\t\t\t[3224] link 'sitcom'\n\t\t\t\t[1608] ListMarker '■ '\n\t\t\t\t[3227] link 'Teen sitcom'\n\t\t[1609] row ''\n\t\t\t[2751] rowheader 'Created by' required: False\n\t\t\t[1610] gridcell 'Bonnie Turner Terry Turner Mark Brazill' required: False\n\t\t\t\t[1614] ListMarker '■ '\n\t\t\t\t[3230] link 'Bonnie Turner'\n\t\t\t\t[1616] ListMarker '■ '\n\t\t\t\t[3232] link 'Terry Turner'\n\t\t\t\t[1618] ListMarker '■ '\n\t\t\t\t[3234] link 'Mark Brazill'\n\t\t[1619] row ''\n\t\t\t[2752] rowheader 'Directed by' required: False\n\t\t\t[1620] gridcell 'David Trainer Terry Hughes (pilot)' required: False\n\t\t\t\t[1624] ListMarker '■ '\n\t\t\t\t[3237] link 'David Trainer'\n\t\t\t\t[1626] ListMarker '■ '\n\t\t\t\t[3239] link 'Terry Hughes'\n\t\t\t\t[3240] StaticText ' (pilot)'\n\t\t[1627] row ''\n\t\t\t[2753] rowheader 'Starring' required: False\n\t\t\t[1628] gridcell 'Topher Grace Mila Kunis Ashton Kutcher Danny Masterson Laura Prepon Wilmer Valderrama Debra Jo Rupp Kurtwood Smith Tanya Roberts Don Stark Lisa Robin Kelly Tommy Chong Josh Meyers' required: False\n\t\t\t\t[1632] ListMarker '■ '\n\t\t\t\t[3243] link 'Topher Grace'\n\t\t\t\t[1634] ListMarker '■ '\n\t\t\t\t[3245] link 'Mila Kunis'\n\t\t\t\t[1636] ListMarker '■ '\n\t\t\t\t[3247] link 'Ashton Kutcher'\n\t[1950] StaticText 'The main teenage cast members were '\n\t[1951] link 'Topher Grace'\n\t[1953] link 'Mila Kunis'\n\t[1955] link 'Ashton Kutcher'\n\t[1957] link 'Danny Masterson'\n\t[1959] link 
'Laura Prepon'\n\t[1961] link 'Wilmer Valderrama'\n\t[1962] StaticText ', and '\n\t[1963] link 'Lisa Robin Kelly'\n\t[1964] StaticText '. The main adult cast members were '\n\t[1965] link 'Debra Jo Rupp'\n\t[1967] link 'Kurtwood Smith'\n\t[1969] link 'Don Stark'\n\t[1971] link 'Tommy Chong'\n\t[1972] StaticText ', and '\n\t[1973] link 'Tanya Roberts'\n\t[1975] StaticText 'In 1999, the show was remade by the '\n\t[1976] link 'ITV'\n\t[1977] StaticText ' network in the United Kingdom as '\n\t[1979] link 'Days Like These'\n\t[1980] StaticText ' using almost '\n\t[1981] link 'verbatim'\n\t[1982] StaticText ' scripts with minor changes to cultural references.'\n\t[1412] link '[]'\n\t[1414] Details ''\n\t\t[1694] DisclosureTriangle 'Cast' expanded: True\n\t\t\t[1983] heading 'Cast'\n\t\t[1985] note ''\n\t\t\t[1986] StaticText 'See also: '\n\t\t\t[1987] link \"List of That '70s, '80s, and '90s Show characters\"\n\t\t[1416] Details ''\n\t\t\t[1697] DisclosureTriangle 'Young cast' expanded: True\n\t\t\t\t[2805] heading 'Young cast'\n\t\t\t[1700] ListMarker '■ '\n\t\t\t[3376] link 'Topher Grace'\n\t\t\t[3378] link 'Eric Forman'\n\t\t\t[3379] StaticText ' (seasons 1–7; special guest season 8): Eric is a nice guy, geeky, physically slight and somewhat clumsy. He is a smart-ass with a fast wit and a deadpan sense of humor. His obsession with movies, particularly '\n\t\t\t[3381] link 'Star Wars'\n\t\t\t[3382] StaticText ', is often referenced throughout the show. For seven seasons Eric is in a relationship with his longtime love and neighbor '\n\t\t\t[3383] link 'Donna Pinciotti'\n\t\t\t[3384] StaticText '. His father, Red, is always hard on him. He convinces his parents to let his best friend '\n\t\t\t[3385] link 'Steven Hyde'\n\t\t\t[3386] StaticText ' move in with them, making Hyde like a brother. 
He decides to become a teacher after high school and he leaves the series at the end of the seventh season to teach in '\nHISTORY_ACTION: type [21] [That '70s Show] [1]\nclick [574] [link \"That '70s Show\"]\n"}, {"role": "assistant", "content": "<think>The section under \"Young cast\" describes Eric Forman, played by Topher Grace, in detail and refers to his central role, indicating he is the main character. I can now answer the question.</think>\n```stop [The main character in That '70s Show is Eric Forman, played by Topher Grace.]```"}]}
|
| 7 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who voices the main character in the lego movie\nObservation: [3271] RootWebArea 'The Lego Movie (franchise)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_(franchise)\n\t[3310] textbox \"Search 'Wikipedia'\" required: False\n\t[3316] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[3317] button '🏠'\n\t[3318] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[3319] button 'Wikipedia'\n\t[3320] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[3321] button '🎲'\n\t[3328] 
heading 'The Lego Movie (franchise)' hasPopup: menu\n\t[2722] StaticText 'The Lego Movie is an American '\n\t[3335] link 'media franchise' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Media_franchise\n\t[2725] StaticText ' based on '\n\t[3336] link 'Lego' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lego\n\t[3337] link 'construction toys' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Construction_toy\n\t[2729] StaticText '. It began with the 2014 film '\n\t[3339] link 'The Lego Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie\n\t[2731] StaticText ', which was directed and written by '\n\t[3340] link 'Phil Lord and Christopher Miller' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Phil_Lord_and_Christopher_Miller\n\t[2733] StaticText '. The success of the film led to the release of two licensed video games, a '\n\t[3341] link '4D film theme park attraction' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie%3A_4D_%E2%80%93_A_New_Adventure\n\t[2735] StaticText ', two spin-off films titled '\n\t[3343] link 'The Lego Batman Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Batman_Movie\n\t[2737] StaticText ' and '\n\t[3345] link 'The Lego Ninjago Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Ninjago_Movie\n\t[2739] StaticText ', which were released in 2017, '\n\t[3347] link 'Unikitty!' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Unikitty!\n\t[2741] StaticText ' an animated television series that also came out in the same year, and the sequel to the original film titled '\n\t[3349] link 'The Lego Movie 2: The Second Part' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_2%3A_The_Second_Part\n\t[2743] StaticText ' in 2019. Plans for a third spin-off film and a sequel to The Lego Batman Movie were later shelved. 
The franchise has received a generally positive critical reception, with the exception of The Lego Ninjago Movie which received mixed reviews, and has grossed $1 billion at the worldwide box office. While '\n\t[3352] link 'Warner Bros.' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Warner_Bros.\n\t[2749] StaticText ' still owned the rights to their run of the franchise from 2014 to 2020, including the Unikitty! animated television series, '\n\t[3354] link 'Universal Pictures' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Universal_Pictures\n\t[3358] table ''\n\t\t[3360] row ''\n\t\t\t[3361] columnheader 'The Lego Movie' required: False\n\t\t[3362] row ''\n\t\t\t[3363] cell 'Logo to the first film.'\n\t\t\t\t[3365] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/The_Lego_Movie_logo.png.webp\n\t\t[3367] row ''\n\t\t\t[3368] rowheader 'Created by' required: False\n\t\t\t[3369] cell 'Dan Lin Dan Hageman Kevin Hageman Roy Lee Phil Lord Christopher Miller'\n\t\t\t\t[2756] ListMarker '■ '\n\t\t\t\t[3373] link 'Dan Lin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dan_Lin\n\t\t\t\t[2757] ListMarker '■'\n\t\t\t\t[3375] link 'Dan Hageman Kevin Hageman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Hageman_Brothers\n\t\t\t\t\t[2758] ListMarker '■ '\n\t\t\t\t\t[2759] ListMarker '■ '\n\t[3462] group ''\n\t\t[3467] DisclosureTriangle 'Films' expanded: True\n\t\t\t[3468] heading 'Films'\nHISTORY_ACTION: type [329] [The Lego Movie] [1]\nclick [2005] [The Lego Movie (franchise)]\n"}, {"role": "assistant", "content": "<think>The user's objective is to find out who voices the main character in The Lego Movie. On the current page, there is a link to \"The Lego Movie\" ([3339]). Clicking this link should lead to a page specifically about the film, where cast information is likely to be found.</think>\n```click [3339] [The Lego Movie]```"}]}
|
| 8 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: girl fox name on fox and the hound\nObservation: [1816] RootWebArea 'The Fox and the Hound' focused: True\n\t[2460] textbox \"Search 'Wikipedia'\" required: False\n\t[2462] link 'Go to welcome page'\n\t\t[2468] button '🏠'\n\t[2463] link \"Go to the main page of 'Wikipedia'\"\n\t\t[2470] button 'Wikipedia'\n\t[2464] link 'Go to a randomly selected page'\n\t\t[2472] button '🎲'\n\t[1829] table ''\n\t\t[4619] row ''\n\t\t\t[4628] gridcell 'Original theatrical release poster' required: False\n\t\t[2114] row ''\n\t\t\t[4629] rowheader 'Directed by' required: False\n\t\t\t[2115] gridcell 'Ted Berman Richard Rich Art Stevens' required: False\n\t\t\t\t[2119] 
ListMarker '■ '\n\t\t\t\t[5171] link 'Ted Berman'\n\t\t\t\t[2121] ListMarker '■ '\n\t\t\t\t[5173] link 'Richard Rich'\n\t\t\t\t[2123] ListMarker '■ '\n\t\t\t\t[5175] link 'Art Stevens'\n\t\t[2124] row ''\n\t\t\t[4630] rowheader 'Story by' required: False\n\t\t\t[2125] gridcell 'Larry Clemmons Ted Berman David Michener Peter Young Burny Mattinson Steve Hulett Earl Kress Vance Gerry' required: False\n\t\t\t\t[2129] ListMarker '■ '\n\t\t\t\t[5178] link 'Larry Clemmons'\n\t\t\t\t[2131] ListMarker '■ '\n\t\t\t\t[5180] StaticText 'Ted Berman'\n\t\t\t\t[2133] ListMarker '■ '\n\t\t\t\t[5181] StaticText 'David Michener'\n\t\t\t\t[2135] ListMarker '■ '\n\t\t\t\t[5182] StaticText 'Peter Young'\n\t\t\t\t[2137] ListMarker '■ '\n\t\t\t\t[5183] link 'Burny Mattinson'\n\t\t\t\t[2139] ListMarker '■ '\n\t\t\t\t[5185] StaticText 'Steve Hulett'\n\t\t\t\t[2141] ListMarker '■ '\n\t\t\t\t[5186] StaticText 'Earl Kress'\n\t\t\t\t[2143] ListMarker '■ '\n\t\t\t\t[5187] link 'Vance Gerry'\n\t\t[4620] row ''\n\t\t\t[4631] rowheader 'Based on' required: False\n\t\t\t[4632] gridcell 'The Fox and the Hound by Daniel P. Mannix' required: False\n\t\t\t\t[5191] link 'The Fox and the Hound'\n\t\t\t\t[5194] link 'Daniel P. 
Mannix'\n\t\t[2144] row ''\n\t\t\t[4633] rowheader 'Produced by' required: False\n\t\t\t[2145] gridcell 'Ron Miller Wolfgang Reitherman Art Stevens' required: False\n\t\t\t\t[2149] ListMarker '■ '\n\t\t\t\t[5198] link 'Ron Miller'\n\t\t\t\t[2151] ListMarker '■ '\n\t\t\t\t[5200] link 'Wolfgang Reitherman'\n\t\t\t\t[2153] ListMarker '■ '\n\t\t\t\t[5202] StaticText 'Art Stevens'\n\t\t[2154] row ''\n\t\t\t[4634] rowheader 'Starring' required: False\n\t\t\t[2155] gridcell 'Mickey Rooney Kurt Russell Pearl Bailey Jack Albertson Sandy Duncan Jeanette Nolan Pat Buttram John Fiedler John McIntire Dick Bakalyan Paul Winchell Keith Mitchell Corey Feldman' required: False\n\t\t\t\t[2159] ListMarker '■ '\n\t\t\t\t[5204] link 'Mickey Rooney'\n\t\t\t\t[2161] ListMarker '■ '\n\t\t\t\t[5206] link 'Kurt Russell'\n\t\t\t\t[2163] ListMarker '■ '\n\t\t\t\t[5208] link 'Pearl Bailey'\n\t\t\t\t[2165] ListMarker '■ '\n\t\t\t\t[5210] link 'Jack Albertson'\n\t[2563] StaticText 'Walt Disney Productions first obtained the film rights to the novel by Daniel P. Mannix in 1967; however, actual development on the film would not occur until spring 1977. The Fox and the Hound marked the last involvement of the remaining members of '\n\t[2567] link \"Disney's Nine Old Men\"\n\t[2568] StaticText ', which included '\n\t[2569] link 'Frank Thomas'\n\t[2570] StaticText ' and '\n\t[2571] link 'Ollie Johnston'\n\t[2572] StaticText '. Though the old animators had involvement in early development of the film, the film was ultimately handed over to a new generation of animators following the retirement of the old animators. As such, it was the first film for future directors including '\n\t[2573] link 'Tim Burton'\n\t[2575] link 'Brad Bird'\n\t[2576] StaticText ', and '\n\t[2577] link 'John Lasseter'\n\t[2579] link 'Don Bluth'\n\t[2580] StaticText ' and his team of animators. 
Further concerns were raised over the handling of the scene in which Chief is hit by a train, which was changed from his death into a non-fatal injury by which he merely suffers a broken leg.'\n\t[2582] StaticText 'The Fox and the Hound was released to theaters on July 10, 1981 by '\n\t[2584] link 'Buena Vista Distribution'\n\t[2585] StaticText '. The film was a financial success, becoming the 14th highest-grossing film of the year and earning $39.9 million in the United States. However, it initially received mixed reviews from critics, who praised the animation and voice acting of the film, but believed that the film was not groundbreaking enough. Over time, however, the film became known as an underrated classic, and it was nominated for three awards, of which it won one. At the time of its release, it was the most expensive animated film produced to date, costing $12 million.'\n\t[1843] link '[]'\n\t[2586] StaticText ' It was re-released to theaters on March 25, 1988.'\n\t[1846] link '[]'\n\t[2587] StaticText ' An intermediate follow-up, '\n\t[2589] link 'The Fox and the Hound 2'\n\t[2590] StaticText ', was released '\n\t[2591] link 'directly-to-DVD'\n\t[2592] StaticText ' on December 12, 2006.'\n\t[2198] Details ''\n\t\t[2200] DisclosureTriangle 'Plot' expanded: True\n\t\t\t[2593] heading 'Plot'\n\t\t[2598] StaticText 'After a young '\n\t\t[2599] link 'red fox'\n\t\t[2600] StaticText ' is orphaned, Big Mama the '\n\t\t[2601] link 'owl'\n\t\t[2602] StaticText ' and her friends, Dinky the '\n\t\t[2603] link 'finch'\n\t\t[2604] StaticText ' and Boomer the '\n\t\t[2605] link 'woodpecker'\n\t\t[2606] StaticText ', arrange for him to be adopted by kindly farmer Widow Tweed, who names him Tod. 
Meanwhile, her neighbor, hunter Amos Slade, brings home a young '\nHISTORY_ACTION: type [21] [fox and the hound] [1]\nclick [570] [link 'The Fox and the Hound']\nscroll [down]\n"}, {"role": "assistant", "content": "<think>The plot section has just started, but the answer may be further down. I should continue scrolling to find the name of the girl fox character.</think>\n```scroll [down]```"}]}
|
| 9 |
+
{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who sang with you i'm born again\nObservation: [1219] RootWebArea \"With You I'm Born Again\" focused: True\n\t[1532] textbox \"Search 'Wikipedia'\" required: False\n\t[1534] link 'Go to welcome page'\n\t\t[1540] button '🏠'\n\t[1535] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1542] button 'Wikipedia'\n\t[1536] link 'Go to a randomly selected page'\n\t\t[1544] button '🎲'\n\t[1546] heading \"With You I'm Born Again\" hasPopup: menu\n\t[1556] StaticText '\"With You I\\'m Born Again\" is a 1979 '\n\t[1560] link 'duet'\n\t[1561] StaticText ' written by '\n\t[1562] link 'Carol Connors'\n\t[1563] StaticText ' and '\n\t[1564] link 'David 
Shire'\n\t[1565] StaticText ' that originated on the soundtrack of the 1979 motion picture '\n\t[1567] link 'Fast Break'\n\t[1568] StaticText '. It was performed by '\n\t[1569] link 'Motown'\n\t[1570] StaticText ' recording artists '\n\t[1571] link 'Billy Preston'\n\t[1572] StaticText ' and '\n\t[1573] link 'Syreeta Wright'\n\t[1574] StaticText ' (credited as Syreeta) and became an international hit for the duo, reaching number four on the '\n\t[1575] link 'Billboard Hot 100'\n\t[1230] link '[]'\n\t[1576] StaticText ' and number two on the UK singles chart.'\n\t[1233] link '[]'\n\t[1235] table ''\n\t\t[1991] row ''\n\t\t\t[1997] columnheader '\"With You I\\'m Born Again\"' required: False\n\t\t[1992] row ''\n\t\t\t[1998] gridcell '' required: False\n\t\t[1993] row ''\n\t\t\t[1999] columnheader 'Single by Billy Preston and Syreeta Wright' required: False\n\t\t\t\t[2095] link 'Single'\n\t\t\t\t[2097] link 'Billy Preston'\n\t\t\t\t[2099] link 'Syreeta Wright'\n\t\t[1994] row ''\n\t\t\t[2000] columnheader 'from the album Fast Break' required: False\n\t\t[1237] row ''\n\t\t\t[2001] rowheader 'B-side' required: False\n\t\t\t\t[2107] link 'B-side'\n\t\t\t[1238] gridcell '\"All I Wanted Was You\"[]' required: False\n\t\t\t\t[1240] link '[]'\n\t\t[1242] row ''\n\t\t\t[2002] rowheader 'Released' required: False\n\t\t\t[1243] gridcell 'December 1979[]' required: False\n\t\t\t\t[1245] link '[]'\n\t\t[1995] row ''\n\t\t\t[2003] rowheader 'Genre' required: False\n\t\t\t\t[2121] link 'Genre'\n\t\t\t[2004] gridcell 'Soul' required: False\n\t\t\t\t[2123] link 'Soul'\n\t\t[1247] row ''\n\t\t\t[2005] rowheader 'Label' required: False\n\t\t\t\t[2125] link 'Label'\n\t\t\t[1248] gridcell 'Motown[]' required: False\n\t\t\t\t[2127] link 'Motown'\n\t\t\t\t[1250] link '[]'\n\t\t[1996] row ''\n\t\t\t[2006] rowheader 'Songwriter(s)' required: False\n\t\t\t\t[2134] link 'Songwriter(s)'\n\t\t\t[2007] gridcell 'Carol Connors · David Shire' required: False\n\t\t\t\t[2140] link 'Carol 
Connors'\n\t\t\t\t[2144] link 'David Shire'\n\t\t[1252] row ''\n\t\t\t[2008] rowheader 'Producer(s)' required: False\n\t\t\t\t[2147] link 'Producer(s)'\n\t\t\t[1253] gridcell 'James DiPasquale[]' required: False\n\t\t\t\t[2149] link 'James DiPasquale'\n\t\t\t\t[1255] link '[]'\n\t[1257] Details ''\n\t\t[1429] DisclosureTriangle 'History' expanded: True\n\t\t\t[1577] heading 'History'\n\t\t[1579] StaticText 'The music from '\n\t\t[1581] link 'Fast Break'\n\t\t[1582] StaticText ' was credited to '\n\t\t[1583] link 'David Shire'\n\t\t[1584] StaticText ' and '\n\t\t[1585] link 'James DiPasquale'\n\t\t[1586] StaticText ', and '\n\t\t[1587] link 'Carol Connors'\n\t\t[1588] StaticText ' put words to four of the songs on the soundtrack album. Connors recalls, \"I had written most of the lyrics to \\'With You I\\'m Born Again\\' in 22 minutes, but I couldn\\'t find two words and it was driving me crazy.\"'\n\t\t[1261] link '[]'\n\t\t[1589] StaticText ' The right words came to her on her way to an ice cream parlor, where she requested a pencil and paper along with her order. \"I had finally found it—the line I was seeking--\\'Come show me your kindness, in your arms I know I\\'ll find this\\'—and I wanted to get it down on paper.\"'\n\t\t[1264] link '[]'\n\t\t[1590] link 'Motown'\n\t\t[1591] StaticText ' executive '\n\t\t[1592] link 'Suzanne de Passe'\n\t\t[1268] link '[]'\n\t\t[1594] StaticText ' and each singer also had a solo number to cut for the soundtrack album as well. Connors recalled, \"Billy and Syreeta were originally not very fond of the idea of recording together—they each wanted to do their own thing, but this worked out extremely well for both of them.\"'\n\t\t[1271] link '[]'\n\t\t[1595] StaticText ' The reluctant pair were surprised by the success of \"With You I\\'m Born Again\". 
\"Though both liked the song, they never dreamed of it being a hit single.\"'\n\t\t[1274] link '[]'\n\t\t[1596] StaticText 'Connors noted the impression some were left with by the song and Preston. \"When David Shire heard Billy Preston play the song on the organ, he said to me, \\'Now I know why he\\'s a star.\\'\" Connors also shared, \"The late great '\nHISTORY_ACTION: type [21] [With You I'm Born Again] [1]\nclick [570] [link \"With You I'm Born Again\"]\n"}, {"role": "assistant", "content": "<think>The answer can be found in the static text: '\"With You I'm Born Again\"... It was performed by Motown recording artists Billy Preston and Syreeta Wright (credited as Syreeta) and became an international hit for the duo.' The singers are Billy Preston and Syreeta Wright.</think>\n```stop [Billy Preston and Syreeta Wright]```"}]}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/args.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 4,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 2,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 1e-06,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 4.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"corr_nq_1500q_hotpot_1500q_swift.jsonl"
|
| 236 |
+
],
|
| 237 |
+
"val_dataset": [],
|
| 238 |
+
"split_dataset_ratio": 0.001,
|
| 239 |
+
"dataset_num_proc": 100,
|
| 240 |
+
"load_from_cache_file": true,
|
| 241 |
+
"dataset_shuffle": true,
|
| 242 |
+
"val_dataset_shuffle": false,
|
| 243 |
+
"streaming": false,
|
| 244 |
+
"interleave_prob": null,
|
| 245 |
+
"stopping_strategy": "first_exhausted",
|
| 246 |
+
"shuffle_buffer_size": 1000,
|
| 247 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 248 |
+
"columns": {},
|
| 249 |
+
"strict": false,
|
| 250 |
+
"model_name": null,
|
| 251 |
+
"model_author": null,
|
| 252 |
+
"custom_dataset_info": [],
|
| 253 |
+
"quant_method": null,
|
| 254 |
+
"quant_bits": null,
|
| 255 |
+
"hqq_axis": null,
|
| 256 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 257 |
+
"bnb_4bit_quant_type": "nf4",
|
| 258 |
+
"bnb_4bit_use_double_quant": true,
|
| 259 |
+
"bnb_4bit_quant_storage": null,
|
| 260 |
+
"max_new_tokens": 64,
|
| 261 |
+
"temperature": 0.0,
|
| 262 |
+
"top_k": null,
|
| 263 |
+
"top_p": null,
|
| 264 |
+
"repetition_penalty": null,
|
| 265 |
+
"num_beams": 1,
|
| 266 |
+
"stream": false,
|
| 267 |
+
"stop_words": [],
|
| 268 |
+
"logprobs": false,
|
| 269 |
+
"top_logprobs": null,
|
| 270 |
+
"ckpt_dir": null,
|
| 271 |
+
"lora_modules": [],
|
| 272 |
+
"train_type": "full",
|
| 273 |
+
"adapters": [],
|
| 274 |
+
"external_plugins": [],
|
| 275 |
+
"model_kwargs": {},
|
| 276 |
+
"load_args": false,
|
| 277 |
+
"load_data_args": false,
|
| 278 |
+
"packing": false,
|
| 279 |
+
"packing_length": null,
|
| 280 |
+
"lazy_tokenize": false,
|
| 281 |
+
"cached_dataset": [],
|
| 282 |
+
"custom_register_path": [],
|
| 283 |
+
"use_hf": false,
|
| 284 |
+
"ignore_args_error": false,
|
| 285 |
+
"use_swift_lora": false,
|
| 286 |
+
"freeze_parameters": [],
|
| 287 |
+
"freeze_parameters_regex": null,
|
| 288 |
+
"freeze_parameters_ratio": 0.0,
|
| 289 |
+
"trainable_parameters": [],
|
| 290 |
+
"trainable_parameters_regex": null,
|
| 291 |
+
"freeze_llm": false,
|
| 292 |
+
"freeze_vit": true,
|
| 293 |
+
"freeze_aligner": false,
|
| 294 |
+
"target_modules": [
|
| 295 |
+
"all-linear"
|
| 296 |
+
],
|
| 297 |
+
"target_regex": null,
|
| 298 |
+
"modules_to_save": [],
|
| 299 |
+
"lora_rank": 8,
|
| 300 |
+
"lora_alpha": 32,
|
| 301 |
+
"lora_dropout": 0.05,
|
| 302 |
+
"lora_bias": "none",
|
| 303 |
+
"lora_dtype": null,
|
| 304 |
+
"lorap_lr_ratio": null,
|
| 305 |
+
"use_rslora": false,
|
| 306 |
+
"use_dora": false,
|
| 307 |
+
"lora_ga_batch_size": 2,
|
| 308 |
+
"lora_ga_iters": 2,
|
| 309 |
+
"lora_ga_max_length": 1024,
|
| 310 |
+
"lora_ga_direction": "ArB2r",
|
| 311 |
+
"lora_ga_scale": "stable",
|
| 312 |
+
"lora_ga_stable_gamma": 16,
|
| 313 |
+
"init_weights": true,
|
| 314 |
+
"fourier_n_frequency": 2000,
|
| 315 |
+
"fourier_scaling": 300.0,
|
| 316 |
+
"boft_block_size": 4,
|
| 317 |
+
"boft_block_num": 0,
|
| 318 |
+
"boft_n_butterfly_factor": 1,
|
| 319 |
+
"boft_dropout": 0.0,
|
| 320 |
+
"vera_rank": 256,
|
| 321 |
+
"vera_projection_prng_key": 0,
|
| 322 |
+
"vera_dropout": 0.0,
|
| 323 |
+
"vera_d_initial": 0.1,
|
| 324 |
+
"adapter_act": "gelu",
|
| 325 |
+
"adapter_length": 128,
|
| 326 |
+
"use_galore": false,
|
| 327 |
+
"galore_target_modules": null,
|
| 328 |
+
"galore_rank": 128,
|
| 329 |
+
"galore_update_proj_gap": 50,
|
| 330 |
+
"galore_scale": 1.0,
|
| 331 |
+
"galore_proj_type": "std",
|
| 332 |
+
"galore_optim_per_parameter": false,
|
| 333 |
+
"galore_with_embedding": false,
|
| 334 |
+
"galore_quantization": false,
|
| 335 |
+
"galore_proj_quant": false,
|
| 336 |
+
"galore_proj_bits": 4,
|
| 337 |
+
"galore_proj_group_size": 256,
|
| 338 |
+
"galore_cos_threshold": 0.4,
|
| 339 |
+
"galore_gamma_proj": 2,
|
| 340 |
+
"galore_queue_size": 5,
|
| 341 |
+
"adalora_target_r": 8,
|
| 342 |
+
"adalora_init_r": 12,
|
| 343 |
+
"adalora_tinit": 0,
|
| 344 |
+
"adalora_tfinal": 0,
|
| 345 |
+
"adalora_deltaT": 1,
|
| 346 |
+
"adalora_beta1": 0.85,
|
| 347 |
+
"adalora_beta2": 0.85,
|
| 348 |
+
"adalora_orth_reg_weight": 0.5,
|
| 349 |
+
"llamapro_num_new_blocks": 4,
|
| 350 |
+
"llamapro_num_groups": null,
|
| 351 |
+
"lisa_activated_layers": 0,
|
| 352 |
+
"lisa_step_interval": 20,
|
| 353 |
+
"reft_layer_key": null,
|
| 354 |
+
"reft_layers": null,
|
| 355 |
+
"reft_rank": 4,
|
| 356 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 357 |
+
"reft_args": null,
|
| 358 |
+
"swanlab_token": null,
|
| 359 |
+
"swanlab_project": null,
|
| 360 |
+
"swanlab_workspace": null,
|
| 361 |
+
"swanlab_exp_name": null,
|
| 362 |
+
"swanlab_lark_webhook_url": null,
|
| 363 |
+
"swanlab_lark_secret": null,
|
| 364 |
+
"swanlab_mode": "cloud",
|
| 365 |
+
"add_version": true,
|
| 366 |
+
"create_checkpoint_symlink": false,
|
| 367 |
+
"zero_hpz_partition_size": null,
|
| 368 |
+
"deepspeed_autotp_size": null,
|
| 369 |
+
"early_stop_interval": null,
|
| 370 |
+
"rank": 0,
|
| 371 |
+
"global_world_size": 8,
|
| 372 |
+
"local_world_size": 8,
|
| 373 |
+
"model_suffix": "Qwen2.5-7B",
|
| 374 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 375 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fe277760ee0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 376 |
+
"model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
|
| 377 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 378 |
+
"evaluation_strategy": "epoch",
|
| 379 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 380 |
+
}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/args.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 4,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 2,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 1e-06,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 4.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"corr_nq_1500q_hotpot_1500q_swift.jsonl"
|
| 236 |
+
],
|
| 237 |
+
"val_dataset": [],
|
| 238 |
+
"split_dataset_ratio": 0.001,
|
| 239 |
+
"dataset_num_proc": 100,
|
| 240 |
+
"load_from_cache_file": true,
|
| 241 |
+
"dataset_shuffle": true,
|
| 242 |
+
"val_dataset_shuffle": false,
|
| 243 |
+
"streaming": false,
|
| 244 |
+
"interleave_prob": null,
|
| 245 |
+
"stopping_strategy": "first_exhausted",
|
| 246 |
+
"shuffle_buffer_size": 1000,
|
| 247 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 248 |
+
"columns": {},
|
| 249 |
+
"strict": false,
|
| 250 |
+
"model_name": null,
|
| 251 |
+
"model_author": null,
|
| 252 |
+
"custom_dataset_info": [],
|
| 253 |
+
"quant_method": null,
|
| 254 |
+
"quant_bits": null,
|
| 255 |
+
"hqq_axis": null,
|
| 256 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 257 |
+
"bnb_4bit_quant_type": "nf4",
|
| 258 |
+
"bnb_4bit_use_double_quant": true,
|
| 259 |
+
"bnb_4bit_quant_storage": null,
|
| 260 |
+
"max_new_tokens": 64,
|
| 261 |
+
"temperature": 0.0,
|
| 262 |
+
"top_k": null,
|
| 263 |
+
"top_p": null,
|
| 264 |
+
"repetition_penalty": null,
|
| 265 |
+
"num_beams": 1,
|
| 266 |
+
"stream": false,
|
| 267 |
+
"stop_words": [],
|
| 268 |
+
"logprobs": false,
|
| 269 |
+
"top_logprobs": null,
|
| 270 |
+
"ckpt_dir": null,
|
| 271 |
+
"lora_modules": [],
|
| 272 |
+
"train_type": "full",
|
| 273 |
+
"adapters": [],
|
| 274 |
+
"external_plugins": [],
|
| 275 |
+
"model_kwargs": {},
|
| 276 |
+
"load_args": false,
|
| 277 |
+
"load_data_args": false,
|
| 278 |
+
"packing": false,
|
| 279 |
+
"packing_length": null,
|
| 280 |
+
"lazy_tokenize": false,
|
| 281 |
+
"cached_dataset": [],
|
| 282 |
+
"custom_register_path": [],
|
| 283 |
+
"use_hf": false,
|
| 284 |
+
"ignore_args_error": false,
|
| 285 |
+
"use_swift_lora": false,
|
| 286 |
+
"freeze_parameters": [],
|
| 287 |
+
"freeze_parameters_regex": null,
|
| 288 |
+
"freeze_parameters_ratio": 0.0,
|
| 289 |
+
"trainable_parameters": [],
|
| 290 |
+
"trainable_parameters_regex": null,
|
| 291 |
+
"freeze_llm": false,
|
| 292 |
+
"freeze_vit": true,
|
| 293 |
+
"freeze_aligner": false,
|
| 294 |
+
"target_modules": [
|
| 295 |
+
"all-linear"
|
| 296 |
+
],
|
| 297 |
+
"target_regex": null,
|
| 298 |
+
"modules_to_save": [],
|
| 299 |
+
"lora_rank": 8,
|
| 300 |
+
"lora_alpha": 32,
|
| 301 |
+
"lora_dropout": 0.05,
|
| 302 |
+
"lora_bias": "none",
|
| 303 |
+
"lora_dtype": null,
|
| 304 |
+
"lorap_lr_ratio": null,
|
| 305 |
+
"use_rslora": false,
|
| 306 |
+
"use_dora": false,
|
| 307 |
+
"lora_ga_batch_size": 2,
|
| 308 |
+
"lora_ga_iters": 2,
|
| 309 |
+
"lora_ga_max_length": 1024,
|
| 310 |
+
"lora_ga_direction": "ArB2r",
|
| 311 |
+
"lora_ga_scale": "stable",
|
| 312 |
+
"lora_ga_stable_gamma": 16,
|
| 313 |
+
"init_weights": true,
|
| 314 |
+
"fourier_n_frequency": 2000,
|
| 315 |
+
"fourier_scaling": 300.0,
|
| 316 |
+
"boft_block_size": 4,
|
| 317 |
+
"boft_block_num": 0,
|
| 318 |
+
"boft_n_butterfly_factor": 1,
|
| 319 |
+
"boft_dropout": 0.0,
|
| 320 |
+
"vera_rank": 256,
|
| 321 |
+
"vera_projection_prng_key": 0,
|
| 322 |
+
"vera_dropout": 0.0,
|
| 323 |
+
"vera_d_initial": 0.1,
|
| 324 |
+
"adapter_act": "gelu",
|
| 325 |
+
"adapter_length": 128,
|
| 326 |
+
"use_galore": false,
|
| 327 |
+
"galore_target_modules": null,
|
| 328 |
+
"galore_rank": 128,
|
| 329 |
+
"galore_update_proj_gap": 50,
|
| 330 |
+
"galore_scale": 1.0,
|
| 331 |
+
"galore_proj_type": "std",
|
| 332 |
+
"galore_optim_per_parameter": false,
|
| 333 |
+
"galore_with_embedding": false,
|
| 334 |
+
"galore_quantization": false,
|
| 335 |
+
"galore_proj_quant": false,
|
| 336 |
+
"galore_proj_bits": 4,
|
| 337 |
+
"galore_proj_group_size": 256,
|
| 338 |
+
"galore_cos_threshold": 0.4,
|
| 339 |
+
"galore_gamma_proj": 2,
|
| 340 |
+
"galore_queue_size": 5,
|
| 341 |
+
"adalora_target_r": 8,
|
| 342 |
+
"adalora_init_r": 12,
|
| 343 |
+
"adalora_tinit": 0,
|
| 344 |
+
"adalora_tfinal": 0,
|
| 345 |
+
"adalora_deltaT": 1,
|
| 346 |
+
"adalora_beta1": 0.85,
|
| 347 |
+
"adalora_beta2": 0.85,
|
| 348 |
+
"adalora_orth_reg_weight": 0.5,
|
| 349 |
+
"llamapro_num_new_blocks": 4,
|
| 350 |
+
"llamapro_num_groups": null,
|
| 351 |
+
"lisa_activated_layers": 0,
|
| 352 |
+
"lisa_step_interval": 20,
|
| 353 |
+
"reft_layer_key": null,
|
| 354 |
+
"reft_layers": null,
|
| 355 |
+
"reft_rank": 4,
|
| 356 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 357 |
+
"reft_args": null,
|
| 358 |
+
"swanlab_token": null,
|
| 359 |
+
"swanlab_project": null,
|
| 360 |
+
"swanlab_workspace": null,
|
| 361 |
+
"swanlab_exp_name": null,
|
| 362 |
+
"swanlab_lark_webhook_url": null,
|
| 363 |
+
"swanlab_lark_secret": null,
|
| 364 |
+
"swanlab_mode": "cloud",
|
| 365 |
+
"add_version": true,
|
| 366 |
+
"create_checkpoint_symlink": false,
|
| 367 |
+
"zero_hpz_partition_size": null,
|
| 368 |
+
"deepspeed_autotp_size": null,
|
| 369 |
+
"early_stop_interval": null,
|
| 370 |
+
"rank": 0,
|
| 371 |
+
"global_world_size": 8,
|
| 372 |
+
"local_world_size": 8,
|
| 373 |
+
"model_suffix": "Qwen2.5-7B",
|
| 374 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 375 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fe277760ee0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 376 |
+
"model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
|
| 377 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 378 |
+
"evaluation_strategy": "epoch",
|
| 379 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=4.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v0-20250824-201458', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 380 |
+
}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"eos_token_id": 151643,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 3584,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 18944,
|
| 12 |
+
"layer_types": [
|
| 13 |
+
"full_attention",
|
| 14 |
+
"full_attention",
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention"
|
| 41 |
+
],
|
| 42 |
+
"max_position_embeddings": 131072,
|
| 43 |
+
"max_window_layers": 28,
|
| 44 |
+
"model_type": "qwen2",
|
| 45 |
+
"num_attention_heads": 28,
|
| 46 |
+
"num_hidden_layers": 28,
|
| 47 |
+
"num_key_value_heads": 4,
|
| 48 |
+
"pad_token_id": 151643,
|
| 49 |
+
"rms_norm_eps": 1e-06,
|
| 50 |
+
"rope_scaling": null,
|
| 51 |
+
"rope_theta": 1000000.0,
|
| 52 |
+
"sliding_window": null,
|
| 53 |
+
"tie_word_embeddings": false,
|
| 54 |
+
"torch_dtype": "bfloat16",
|
| 55 |
+
"transformers_version": "4.55.2",
|
| 56 |
+
"use_cache": false,
|
| 57 |
+
"use_mrope": false,
|
| 58 |
+
"use_sliding_window": false,
|
| 59 |
+
"vocab_size": 152064
|
| 60 |
+
}
|
qwen2.5-7b-base-1500q-1500q-old-click-4ep/checkpoint-624/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"eos_token_id": [
|
| 4 |
+
151643,
|
| 5 |
+
151645
|
| 6 |
+
],
|
| 7 |
+
"max_new_tokens": 2048,
|
| 8 |
+
"transformers_version": "4.55.2"
|
| 9 |
+
}
|