hongzhuyi committed on
Commit
76133c8
·
verified ·
1 Parent(s): f159cd4

Upload folder using huggingface_hub

Browse files
Files changed (40) hide show
  1. .gitattributes +1 -0
  2. args.json +380 -0
  3. checkpoint-771/added_tokens.json +24 -0
  4. checkpoint-771/args.json +380 -0
  5. checkpoint-771/chat_template.jinja +54 -0
  6. checkpoint-771/config.json +59 -0
  7. checkpoint-771/generation_config.json +14 -0
  8. checkpoint-771/latest +1 -0
  9. checkpoint-771/merges.txt +0 -0
  10. checkpoint-771/model-00001-of-00004.safetensors +3 -0
  11. checkpoint-771/model-00002-of-00004.safetensors +3 -0
  12. checkpoint-771/model-00003-of-00004.safetensors +3 -0
  13. checkpoint-771/model-00004-of-00004.safetensors +3 -0
  14. checkpoint-771/model.safetensors.index.json +347 -0
  15. checkpoint-771/scheduler.pt +3 -0
  16. checkpoint-771/special_tokens_map.json +31 -0
  17. checkpoint-771/tokenizer.json +3 -0
  18. checkpoint-771/tokenizer_config.json +207 -0
  19. checkpoint-771/trainer_state.json +0 -0
  20. checkpoint-771/training_args.bin +3 -0
  21. checkpoint-771/vocab.json +0 -0
  22. checkpoint-771/zero_to_fp32.py +760 -0
  23. images/eval_loss.png +0 -0
  24. images/eval_runtime.png +0 -0
  25. images/eval_samples_per_second.png +0 -0
  26. images/eval_steps_per_second.png +0 -0
  27. images/eval_token_acc.png +0 -0
  28. images/train_epoch.png +0 -0
  29. images/train_grad_norm.png +0 -0
  30. images/train_learning_rate.png +0 -0
  31. images/train_loss.png +0 -0
  32. images/train_token_acc.png +0 -0
  33. images/train_total_flos.png +0 -0
  34. images/train_train_loss.png +0 -0
  35. images/train_train_runtime.png +0 -0
  36. images/train_train_samples_per_second.png +0 -0
  37. images/train_train_steps_per_second.png +0 -0
  38. logging.jsonl +0 -0
  39. runs/events.out.tfevents.1755435744.TENCENT64.site.1299023.0 +3 -0
  40. val_dataset.jsonl +12 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-771/tokenizer.json filter=lfs diff=lfs merge=lfs -text
args.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 1e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 20.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B-Instruct",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "corr_nq_1500q_hotpot_1500q_new600q_swift.jsonl"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.001,
239
+ "dataset_num_proc": 100,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": false,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [],
287
+ "freeze_parameters_regex": null,
288
+ "freeze_parameters_ratio": 0.0,
289
+ "trainable_parameters": [],
290
+ "trainable_parameters_regex": null,
291
+ "freeze_llm": false,
292
+ "freeze_vit": true,
293
+ "freeze_aligner": false,
294
+ "target_modules": [
295
+ "all-linear"
296
+ ],
297
+ "target_regex": null,
298
+ "modules_to_save": [],
299
+ "lora_rank": 8,
300
+ "lora_alpha": 32,
301
+ "lora_dropout": 0.05,
302
+ "lora_bias": "none",
303
+ "lora_dtype": null,
304
+ "lorap_lr_ratio": null,
305
+ "use_rslora": false,
306
+ "use_dora": false,
307
+ "lora_ga_batch_size": 2,
308
+ "lora_ga_iters": 2,
309
+ "lora_ga_max_length": 1024,
310
+ "lora_ga_direction": "ArB2r",
311
+ "lora_ga_scale": "stable",
312
+ "lora_ga_stable_gamma": 16,
313
+ "init_weights": true,
314
+ "fourier_n_frequency": 2000,
315
+ "fourier_scaling": 300.0,
316
+ "boft_block_size": 4,
317
+ "boft_block_num": 0,
318
+ "boft_n_butterfly_factor": 1,
319
+ "boft_dropout": 0.0,
320
+ "vera_rank": 256,
321
+ "vera_projection_prng_key": 0,
322
+ "vera_dropout": 0.0,
323
+ "vera_d_initial": 0.1,
324
+ "adapter_act": "gelu",
325
+ "adapter_length": 128,
326
+ "use_galore": false,
327
+ "galore_target_modules": null,
328
+ "galore_rank": 128,
329
+ "galore_update_proj_gap": 50,
330
+ "galore_scale": 1.0,
331
+ "galore_proj_type": "std",
332
+ "galore_optim_per_parameter": false,
333
+ "galore_with_embedding": false,
334
+ "galore_quantization": false,
335
+ "galore_proj_quant": false,
336
+ "galore_proj_bits": 4,
337
+ "galore_proj_group_size": 256,
338
+ "galore_cos_threshold": 0.4,
339
+ "galore_gamma_proj": 2,
340
+ "galore_queue_size": 5,
341
+ "adalora_target_r": 8,
342
+ "adalora_init_r": 12,
343
+ "adalora_tinit": 0,
344
+ "adalora_tfinal": 0,
345
+ "adalora_deltaT": 1,
346
+ "adalora_beta1": 0.85,
347
+ "adalora_beta2": 0.85,
348
+ "adalora_orth_reg_weight": 0.5,
349
+ "llamapro_num_new_blocks": 4,
350
+ "llamapro_num_groups": null,
351
+ "lisa_activated_layers": 0,
352
+ "lisa_step_interval": 20,
353
+ "reft_layer_key": null,
354
+ "reft_layers": null,
355
+ "reft_rank": 4,
356
+ "reft_intervention_type": "LoreftIntervention",
357
+ "reft_args": null,
358
+ "swanlab_token": null,
359
+ "swanlab_project": null,
360
+ "swanlab_workspace": null,
361
+ "swanlab_exp_name": null,
362
+ "swanlab_lark_webhook_url": null,
363
+ "swanlab_lark_secret": null,
364
+ "swanlab_mode": "cloud",
365
+ "add_version": true,
366
+ "create_checkpoint_symlink": false,
367
+ "zero_hpz_partition_size": null,
368
+ "deepspeed_autotp_size": null,
369
+ "early_stop_interval": null,
370
+ "rank": 0,
371
+ "global_world_size": 8,
372
+ "local_world_size": 8,
373
+ "model_suffix": "Qwen2.5-7B-Instruct",
374
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
375
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7ff580be9bd0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
376
+ "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
377
+ "hub": "<class 'swift.hub.hub.MSHub'>",
378
+ "evaluation_strategy": "epoch",
379
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
380
+ }
checkpoint-771/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-771/args.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 1e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 20.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B-Instruct",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "corr_nq_1500q_hotpot_1500q_new600q_swift.jsonl"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.001,
239
+ "dataset_num_proc": 100,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": false,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [],
287
+ "freeze_parameters_regex": null,
288
+ "freeze_parameters_ratio": 0.0,
289
+ "trainable_parameters": [],
290
+ "trainable_parameters_regex": null,
291
+ "freeze_llm": false,
292
+ "freeze_vit": true,
293
+ "freeze_aligner": false,
294
+ "target_modules": [
295
+ "all-linear"
296
+ ],
297
+ "target_regex": null,
298
+ "modules_to_save": [],
299
+ "lora_rank": 8,
300
+ "lora_alpha": 32,
301
+ "lora_dropout": 0.05,
302
+ "lora_bias": "none",
303
+ "lora_dtype": null,
304
+ "lorap_lr_ratio": null,
305
+ "use_rslora": false,
306
+ "use_dora": false,
307
+ "lora_ga_batch_size": 2,
308
+ "lora_ga_iters": 2,
309
+ "lora_ga_max_length": 1024,
310
+ "lora_ga_direction": "ArB2r",
311
+ "lora_ga_scale": "stable",
312
+ "lora_ga_stable_gamma": 16,
313
+ "init_weights": true,
314
+ "fourier_n_frequency": 2000,
315
+ "fourier_scaling": 300.0,
316
+ "boft_block_size": 4,
317
+ "boft_block_num": 0,
318
+ "boft_n_butterfly_factor": 1,
319
+ "boft_dropout": 0.0,
320
+ "vera_rank": 256,
321
+ "vera_projection_prng_key": 0,
322
+ "vera_dropout": 0.0,
323
+ "vera_d_initial": 0.1,
324
+ "adapter_act": "gelu",
325
+ "adapter_length": 128,
326
+ "use_galore": false,
327
+ "galore_target_modules": null,
328
+ "galore_rank": 128,
329
+ "galore_update_proj_gap": 50,
330
+ "galore_scale": 1.0,
331
+ "galore_proj_type": "std",
332
+ "galore_optim_per_parameter": false,
333
+ "galore_with_embedding": false,
334
+ "galore_quantization": false,
335
+ "galore_proj_quant": false,
336
+ "galore_proj_bits": 4,
337
+ "galore_proj_group_size": 256,
338
+ "galore_cos_threshold": 0.4,
339
+ "galore_gamma_proj": 2,
340
+ "galore_queue_size": 5,
341
+ "adalora_target_r": 8,
342
+ "adalora_init_r": 12,
343
+ "adalora_tinit": 0,
344
+ "adalora_tfinal": 0,
345
+ "adalora_deltaT": 1,
346
+ "adalora_beta1": 0.85,
347
+ "adalora_beta2": 0.85,
348
+ "adalora_orth_reg_weight": 0.5,
349
+ "llamapro_num_new_blocks": 4,
350
+ "llamapro_num_groups": null,
351
+ "lisa_activated_layers": 0,
352
+ "lisa_step_interval": 20,
353
+ "reft_layer_key": null,
354
+ "reft_layers": null,
355
+ "reft_rank": 4,
356
+ "reft_intervention_type": "LoreftIntervention",
357
+ "reft_args": null,
358
+ "swanlab_token": null,
359
+ "swanlab_project": null,
360
+ "swanlab_workspace": null,
361
+ "swanlab_exp_name": null,
362
+ "swanlab_lark_webhook_url": null,
363
+ "swanlab_lark_secret": null,
364
+ "swanlab_mode": "cloud",
365
+ "add_version": true,
366
+ "create_checkpoint_symlink": false,
367
+ "zero_hpz_partition_size": null,
368
+ "deepspeed_autotp_size": null,
369
+ "early_stop_interval": null,
370
+ "rank": 0,
371
+ "global_world_size": 8,
372
+ "local_world_size": 8,
373
+ "model_suffix": "Qwen2.5-7B-Instruct",
374
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
375
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7ff580be9bd0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
376
+ "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
377
+ "hub": "<class 'swift.hub.hub.MSHub'>",
378
+ "evaluation_strategy": "epoch",
379
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v3-20250817-210120', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
380
+ }
checkpoint-771/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-771/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 28,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 4,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": false,
54
+ "torch_dtype": "bfloat16",
55
+ "transformers_version": "4.55.2",
56
+ "use_cache": false,
57
+ "use_sliding_window": false,
58
+ "vocab_size": 152064
59
+ }
checkpoint-771/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.55.2"
14
+ }
checkpoint-771/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step771
checkpoint-771/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-771/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eeff26ce995a69a7d1f19a86a82e0709c8b6b6b1cdd4e35804709b2b0b75390
3
+ size 4877660776
checkpoint-771/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4d5a15c844a697751a6125125cf72e2179be41882169950c3ca0e7bfeac735f
3
+ size 4932751008
checkpoint-771/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74a3f73af510ded9f803f10bbff8d91f8242e4132998266e268eb3252d8d621
3
+ size 4330865200
checkpoint-771/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcc2c7684dcbd1ad9240da9b018e0212b129ed383ff8138339bc89ff6ad4d6c1
3
+ size 1089994880
checkpoint-771/model.safetensors.index.json ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 333312,
4
+ "total_size": 15231233024
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.norm.weight": "model-00003-of-00004.safetensors"
346
+ }
347
+ }
checkpoint-771/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac216ce5882fe3cdc904af4689aae881c29a3392f3bd1f3bc5595fc945e32fa0
3
+ size 1465
checkpoint-771/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-771/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-771/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-771/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-771/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76e578534622bdf60af3143e006146191dd05b0e9d6e17286040c289f9aba69c
3
+ size 9041
checkpoint-771/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-771/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
@dataclass
class zero_model_state:
    """Contents of one ``*_model_states.pt`` file, as collected by ``parse_model_states``."""
    # buffer name -> tensor (the parser casts buffers to fp32)
    buffers: dict
    # one {param name -> shape} mapping per param group; consumers iterate it as a sequence of dicts
    param_shapes: list
    # [alias name, source name] pairs describing tied parameters
    shared_params: list
    # NOTE(review): filled via ``state_dict.get(DS_VERSION, None)``, so it may be None; the ``int``
    # annotation is kept from the original and is not enforced at runtime - confirm the real type
    ds_version: int
    # {param name -> shape} of frozen parameters, or None when the checkpoint records none
    frozen_param_shapes: dict
    # {param name -> tensor fragment} of frozen parameters, or None
    frozen_param_fragments: dict
47
+
48
+
49
# any truthy value turns on the verbose diagnostics guarded by ``if debug:`` throughout this script
debug = 0

# every checkpoint is deserialized onto the CPU
device = torch.device("cpu")
53
+
54
+
55
def atoi(text):
    """
    Turn an all-digit string into an ``int``; hand any other string back unchanged.
    """
    if text.isdigit():
        return int(text)
    return text
57
+
58
+
59
def natural_keys(text):
    '''
    Sort key for "human" ordering: digit runs compare as integers, everything else as text.
    Typical use: alist.sort(key=natural_keys)
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    key = []
    # the capturing group makes re.split keep the digit runs in its result
    for chunk in re.split(r'(\d+)', text):
        # same conversion as ``atoi``, written inline
        key.append(int(chunk) if chunk.isdigit() else chunk)
    return key
66
+
67
+
68
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model-states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory holding the DeepSpeed checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; stages up to 2 and stage 3 use different file names

    Returns:
        - path of the model-states file

    Raises:
        - ``FileNotFoundError``: the directory or the expected file does not exist
        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` stays unbound and the check below dies with UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
82
+
83
+
84
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Collect the files in ``checkpoint_dir`` that match ``glob_pattern``, ordered with ``natural_keys``.

    Raises:
        - ``FileNotFoundError``: nothing matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    matches.sort(key=natural_keys)
    return matches
92
+
93
+
94
def get_optim_files(checkpoint_dir):
    """
    List the optimizer-state shards (``*_optim_states.pt``) found in ``checkpoint_dir``.
    """
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
96
+
97
+
98
def get_model_state_files(checkpoint_dir):
    """
    List the model-state files (``*_model_states.pt``) found in ``checkpoint_dir``.
    """
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
100
+
101
+
102
def parse_model_states(files):
    """
    Load every model-states file and keep only the pieces weight reconstruction needs.

    Args:
        - ``files``: paths of the ``*_model_states.pt`` files to read

    Returns:
        - list with one ``zero_model_state`` per input file, in input order

    Raises:
        - ``ValueError``: a file has no buffer-names entry, i.e. it is not a model-states checkpoint
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): weights_only=False unpickles arbitrary objects - only use on trusted checkpoints
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; the key is absent when the checkpoint records none
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
146
+
147
+
148
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer-state shards and pull out the fp32 flat groups of every rank.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` shards
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard

    Raises:
        - ``ValueError``: the first shard is not a ZeRO checkpoint, the shard count does not match the
          recorded partition count, or the ZeRO stage is unknown
    """
    total_files = len(files)
    state_dicts = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # only the fp32 master weights are needed, so drop the nested optimizer state right away;
        # the default of None tolerates shards where another helper script already removed it
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(shard)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: passed through to the stage-specific merge; when true the
          frozen parameters are left out of the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the stage-specific merge routine
    if zero_stage <= 2:
        merge_checkpoint = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge_checkpoint = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        # any other stage falls through without a result, as before
        return None
    return merge_checkpoint(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
213
+
214
+
215
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded by rank 0 into ``state_dict`` (ZeRO stage <= 2).

    Does nothing when rank 0 recorded no frozen parameter shapes.

    Args:
        - ``state_dict``: dict being filled; updated in place
        - ``zero_model_states``: parsed model states; only the first entry is read
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = rank0_state.frozen_param_shapes
    frozen_param_fragments = rank0_state.frozen_param_fragments

    # NOTE(review): the listing lost its indentation; the Have/Need diagnostics are assumed to belong
    # to the debug branch - confirm against the upstream script
    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment stored under the parameter's name is used as-is
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For every param group the per-rank partitions are concatenated into one flat vector, which is
    then cut into the individual parameters using the shapes recorded by rank 0.

    Args:
        - ``state_dict``: dict being filled; updated in place
        - ``world_size``: number of partitions; also determines the alignment used by the sanity check
        - ``fp32_flat_groups``: per rank, the list of flat fp32 partitions (one per param group)
        - ``zero_model_states``: parsed model states; only the first entry's ``param_shapes`` is read

    Raises:
        - ``ValueError``: after alignment, the consumed and the available element counts of a group differ
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum(merged_vector.numel() for merged_vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():
            # shapes may be objects exposing numel() or plain sequences of ints
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the fp32 state dict of a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
    parameters, and finally the aliases of shared parameters.

    Returns:
        - ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]

    # buffers
    buffers = rank0_state.buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: an alias is added only when its source made it into the dict
    for alias_name, source_name in rank0_state.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
346
+
347
+
348
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Work out how a parameter of ``unpartitioned_numel`` elements is split over ``world_size`` ranks.

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements held per rank (rounded up) and the number
          of padding elements needed to make the total a multiple of ``world_size``
    """
    # elements missing to reach the next multiple of world_size; 0 when it already divides evenly
    padding_numel = (-unpartitioned_numel) % world_size
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel
353
+
354
+
355
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Each frozen parameter is stored as one fragment per rank; the fragments are concatenated in
    rank order, cut to the parameter's element count and viewed with its shape. Does nothing when
    rank 0 recorded no frozen parameter shapes.

    Args:
        - ``state_dict``: dict being filled; updated in place
        - ``world_size``: number of ranks
        - ``zero_model_states``: parsed model states, one per rank
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    # NOTE(review): the listing lost its indentation; the Have/Need diagnostics are assumed to belong
    # to the debug branch - confirm against the upstream script
    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in rank0_state.frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # one fragment per rank; narrow() drops whatever lies beyond the real element count
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        gathered = torch.cat(param_frags, 0)
        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Nothing is copied until ``contiguous()`` is called; until then the object only remembers where
    the parameter's slices live inside the per-rank flat groups.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        """
        Args:
            - ``flat_groups``: per rank, the list of flat tensors (one per param group)
            - ``flat_groups_offset``: cumulative start offsets of the groups, with the total appended
              as the last entry (so it has one more entry than there are groups)
            - ``offset``: start of this parameter's partition, counted across the groups of one rank
            - ``partitioned_numel``: number of elements each rank holds for this parameter
            - ``shape``: full shape of the parameter; must provide ``numel()``
        """
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            - tensor of shape ``self.shape`` built from the per-rank slices, in rank order

        Raises:
            - ``ValueError``: the partition ``[offset, offset + partitioned_numel)`` does not fall
              inside the ranges described by ``flat_groups_offset``
        """
        end_idx = self.offset + self.partitioned_numel
        group_bounds = self.flat_groups_offset

        # the groups touched by this parameter are the same on every rank, so look them up once
        start_group_id = None
        end_group_id = None
        for group_id in range(len(group_bounds) - 1):
            if group_bounds[group_id] <= self.offset < group_bounds[group_id + 1]:
                start_group_id = group_id
            if group_bounds[group_id] < end_idx <= group_bounds[group_id + 1]:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) lies outside the flat groups")

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                group_start = group_bounds[group_id]
                # clamp to the group start: in every group after the first one the partition begins
                # at local index 0, whereas ``offset - group_start`` would be negative there
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, group_bounds[group_id + 1]) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        # the slice drops the padding that rounds the parameter up to a whole number of partitions
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
435
+
436
+
437
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Register the trainable parameters of a ZeRO stage 3 checkpoint in ``state_dict``.

    Every parameter is stored as a ``GatheredTensor`` that remembers where its per-rank partitions
    live; the actual merge happens when ``contiguous()`` is called on it.

    Args:
        - ``state_dict``: dict being filled; updated in place
        - ``world_size``: number of ranks
        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
        - ``zero_model_states``: parsed model states; only the first entry's ``param_shapes`` is read

    Raises:
        - ``ValueError``: the element count consumed by the parameters differs from what is available
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    # NOTE(review): the listing lost its indentation; the Have/Need diagnostics are assumed to belong
    # to the debug branch - confirm against the upstream script
    if debug:
        # each rank holds a list of flat tensors, so the shapes are reported per group
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    # start offset of every group within one rank, plus the grand total as the last entry
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # ``offset`` counted the elements of one rank; scale it to compare against all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the fp32 state dict of a ZeRO stage 3 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless excluded), trainable
    parameters, and finally the aliases of shared parameters.

    Returns:
        - ``OrderedDict`` mapping names to the values stored by the merge helpers
    """
    rank0_state = zero_model_states[0]

    # buffers
    buffers = rank0_state.buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: an alias is added only when its source made it into the dict
    for alias_name, source_name in rank0_state.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
511
+
512
+
513
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor

    Entries that refer to the very same object (shared tensors) are converted once and then share
    the converted result.

    Args:
        - ``state_dict``: mapping of names to objects providing ``contiguous()``, ``shape`` and ``dtype``
        - ``return_empty_tensor``: when true, allocate uninitialized tensors of matching shape and
          dtype instead of calling ``contiguous()``

    Returns:
        - plain ``dict`` mapping each name to its converted tensor
    """
    torch_state_dict = {}
    first_name_by_id = {}
    for name, tensor in state_dict.items():
        tensor_id = id(tensor)
        first_name = first_name_by_id.get(tensor_id)
        if first_name is not None:
            # shared tensors: reuse what was produced for the first name
            torch_state_dict[name] = torch_state_dict[first_name]
            continue

        first_name_by_id[tensor_id] = name
        if return_empty_tensor:
            converted = torch.empty(tensor.shape, dtype=tensor.dtype)
        else:
            converted = tensor.contiguous()
        torch_state_dict[name] = converted
    return torch_state_dict
531
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
547
+ Convert the pesduo tensor to torch tensor by ``.contiguous()``
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application. i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach for sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model to cpu
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
images/eval_loss.png ADDED
images/eval_runtime.png ADDED
images/eval_samples_per_second.png ADDED
images/eval_steps_per_second.png ADDED
images/eval_token_acc.png ADDED
images/train_epoch.png ADDED
images/train_grad_norm.png ADDED
images/train_learning_rate.png ADDED
images/train_loss.png ADDED
images/train_token_acc.png ADDED
images/train_total_flos.png ADDED
images/train_train_loss.png ADDED
images/train_train_runtime.png ADDED
images/train_train_samples_per_second.png ADDED
images/train_train_steps_per_second.png ADDED
logging.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
runs/events.out.tfevents.1755435744.TENCENT64.site.1299023.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0915247469a6ab19832c46dfe34151a2421d722ce04a5ae023181dff244f56c6
3
+ size 212504
val_dataset.jsonl ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\nInformation already found: Information related to the current query that has been identified in historical actions. You need to integrate and supplement this information.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. 
If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n9.Between <think></think>, you need to use <conclusion></conclusion> to enclose the information obtained in this round that is relevant to the current query. Note that if there is no valid information, this part is not required. 
The enclosed information must be directly usable to answer the original query."}, {"role": "user", "content": "\nObjective: The Estadio El Pocito is a multi-use stadium located in Querétaro City, is the capital and largest city of the state of Querétaro, located in the central portion of which country?\nObservation: [2085] RootWebArea 'Estadio El Pocito' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Estadio_El_Pocito\n\t[2147] textbox \"Search 'Wikipedia'\" required: False\n\t[2153] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[2154] button '🏠'\n\t[2155] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[2156] button 'Wikipedia'\n\t[2157] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[2158] button '🎲'\n\t[2165] heading 'Estadio El Pocito' hasPopup: menu\n\t[2061] StaticText 'The Estadio El Pocito is a multi-use '\n\t[2170] link 'stadium' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stadium\n\t[2065] StaticText ' located in '\n\t[2171] link 'Querétaro City' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Quer%C3%A9taro_City\n\t[2172] link 'Querétaro' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Quer%C3%A9taro\n\t[2069] StaticText '. It is currently used mostly for '\n\t[2173] link 'American football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/American_football\n\t[2071] StaticText ' matches. The stadium has a capacity of 4,000 people.'\n\t[2174] group ''\n\t\t[2179] DisclosureTriangle 'References' expanded: True\n\t\t\t[2180] heading 'References'\n\t[2074] StaticText 'This article is issued from '\n\t[2086] link 'Wikipedia' url: https://en.wikipedia.org/wiki/?title=Estadio_El_Pocito&oldid=859352096\n\t[2076] StaticText '. 
The text is licensed under '\n\t[2087] link 'Creative Commons - Attribution - Sharealike' url: https://creativecommons.org/licenses/by-sa/4.0/\n\t[2078] StaticText '. Additional terms may apply for the media files.'\nHISTORY_ACTION: \ntype [328] [Querétaro City] [1]\ntype [1049] [Querétaro] [1]\ntype [1182] [Estadio El Pocito] [1]\nclick [1590] [Estadio El Pocito]\nHISTORY_info: \n"}, {"role": "assistant", "content": "<think>The objective is to determine the country in which Querétaro City is located. The page mentions that Querétaro City is the capital and largest city of the state of Querétaro. There is a link to \"Querétaro City\" which likely leads to a page with more detailed information about the city, possibly including the country. Clicking this link should provide the necessary information to answer the question.</think>\n```click [2171] [Querétaro City]```"}]}
2
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: fire and fury inside the trump white house how many pages\nObservation: [1783] RootWebArea 'Fire and Fury' focused: True\n\t[2267] textbox \"Search 'Wikipedia'\" required: False\n\t[2269] link 'Go to welcome page'\n\t\t[2275] button '🏠'\n\t[2270] link \"Go to the main page of 'Wikipedia'\"\n\t\t[2277] button 'Wikipedia'\n\t[2271] link 'Go to a randomly selected page'\n\t\t[2279] button '🎲'\n\t[2281] heading 'Fire and Fury' hasPopup: menu\n\t[2298] StaticText 'Fire and Fury: Inside the Trump White House is a 2018 book by journalist '\n\t[2300] link 'Michael Wolff'\n\t[2301] StaticText ' which according to Wolff, details the behavior of '\n\t[2302] 
link 'U.S. President'\n\t[2304] link 'Donald Trump'\n\t[2305] StaticText ', the staff of his '\n\t[2306] link '2016 presidential campaign'\n\t[2307] StaticText ', and the '\n\t[2308] link 'White House'\n\t[2309] StaticText ' staff. The title refers to a quote by Trump about the '\n\t[2310] link 'conflict with North Korea'\n\t[2311] StaticText '. The book became a '\n\t[2313] link 'New York Times'\n\t[2314] StaticText ' number one '\n\t[2315] link 'bestseller'\n\t[2287] note ''\n\t\t[2317] StaticText 'This article is about the 2018 book. For other uses, see '\n\t\t[2318] link 'Fire and Fury (disambiguation)'\n\t[2289] table 'Fire and Fury: Inside the Trump White House'\n\t\t[2320] caption ''\n\t\t[3170] row ''\n\t\t\t[3181] gridcell 'First edition cover' required: False\n\t\t[3171] row ''\n\t\t\t[3182] rowheader 'Author' required: False\n\t\t\t[3183] gridcell 'Michael Wolff' required: False\n\t\t\t\t[3506] link 'Michael Wolff'\n\t[2324] link 'Steve Bannon'\n\t[2325] StaticText '. Trump is depicted as being held in low regard by his White House staff, leading Wolff to claim that \"100% of the people around him\" believe Trump is unfit for office.'\n\t[1809] link '[]'\n\t[1811] Details ''\n\t\t[2080] DisclosureTriangle 'Background' expanded: True\n\t\t\t[2326] heading 'Background'\n\t\t[2329] StaticText 'According to '\n\t\t[2330] link 'Michael Wolff'\n\t\t[2331] StaticText ', when he approached '\n\t\t[2332] link 'Donald Trump'\n\t\t[2333] StaticText ' about writing a book on his '\n\t\t[2334] link 'presidency'\n\t\t[2335] StaticText ', Trump agreed to give him access to the '\n\t\t[2336] link 'White House'\n\t\t[2337] StaticText ' because he liked an article Wolff wrote about him in June 2016 for '\n\t\t[2339] link 'The Hollywood Reporter'\n\t\t[1815] link '[]'\n\t\t[2341] StaticText ' However, Trump later claimed that he had never authorized access for Wolff and never spoke to him for the book.'\n\t\t[1818] link '[]'\n\t\t[2344] link 'West Wing'\n\t\t[2345] 
StaticText ' of the White House, conducting research for his book through interviews and as a \"'\n\t\t[2346] link 'fly on the wall'\n\t\t[2347] StaticText '\" observer. He said he conducted over 200 interviews with Trump and his associates including the senior staff'\nHISTORY_ACTION: type [21] [Fire and Fury: Inside the Trump White House] [1]\nclick [570] [link 'Fire and Fury']\nclick [2289] [table 'Fire and Fury: Inside the Trump White House']\n"}, {"role": "assistant", "content": "<think>The table with id 2289 may contain publication details such as the number of pages. I need to examine the contents of this table further, possibly by scrolling down to reveal more information.</think>\n```scroll [down]```"}]}
3
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\nInformation already found: Information related to the current query that has been identified in historical actions. You need to integrate and supplement this information.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. 
If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n9.Between <think></think>, you need to use <conclusion></conclusion> to enclose the information obtained in this round that is relevant to the current query. Note that if there is no valid information, this part is not required. 
The enclosed information must be directly usable to answer the original query."}, {"role": "user", "content": "\nObjective: Are both Mariqueen Maandig and Pelle Almqvist from the same country?\nObservation: [1273] RootWebArea 'Search: Mariqueen Maandig' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Mariqueen+Maandig\n\t[1320] textbox \"Search 'Wikipedia'\" required: False\n\t[1326] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1327] button '🏠'\n\t[1328] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1329] button 'Wikipedia'\n\t[1330] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1331] button '🎲'\n\t[1025] StaticText 'Results 1-25 of 142 for \"Mariqueen Maandig\"'\n\t[1339] link 'Mariqueen Maandig' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mariqueen_Maandig\n\t[1032] StaticText 'Mariqueen Maandig Mariqueen Maandig Reznor (née Maandig; born April 5, 1981)[1] is an American singer, songwriter, and musician. She is the vocalist for How to Destroy Angels, and was the former vocalist of Los Angeles-based rock band West Indian Girl. 
Mariqueen Maandig Maandig performing in May 2013 Background information Birth name Mariqueen Arrabaca Maandig Born (1981-04-05) April 5, 1981 Genres Alternative rock post-industrial Occupation(s) Singer songwriter musician Instruments Vocals, bass......'\n\t[1052] StaticText 'from Wikipedia'\n\t[1053] StaticText '528 words'\n\t[1354] link 'An Omen EP' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/An_Omen_EP\n\t[1065] StaticText '1,026 words'\n\t[1363] link 'How to Destroy Angels (band)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/How_to_Destroy_Angels_(band)\n\t[1075] StaticText 'from Wikipedia'\n\t[1076] StaticText '1,613 words'\n\t[1372] link 'West Indian Girl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/West_Indian_Girl\n\t[1084] StaticText '445 words'\n\t[1379] link 'Ice Age (song)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ice_Age_(song)\n\t[1086] StaticText '...(s) Mariqueen Maandig Trent Reznor Atticus Ross Rob Sheridan Producer(s) How to Destroy Angels Music video \"Ice Age\" on YouTube Music video \"Ice Age\" is one of three songs from An Omen EP with a music video (the other two being \"Keep It Together\" and \"The Loop Closes\"). It was directed by John Hillcoat, unlike the other two videos which were directed by the group themselves. The music video for the song premiered on November 28, 2012 through How to Destroy Angels\\' VEVO channel on YouTube. It......'\n\t[1091] StaticText 'from Wikipedia'\n\t[1092] StaticText '454 words'\n\t[1386] link '4th & Wall' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/4th_&_Wall\n\t[1094] StaticText '...Mariqueen Maandig on vocals, Mark Lewis on drums and Nathan Van Hala and Amy White on keyboards. Track listing No. Title Length 1. \"To Die In L.A.\" 4:09 2. \"Blue Wave\" 4:14 3. \"Sofia\" 5:59 4. \"All My Friends\" 4:36 5. \"Indian Ocean\" 5:00 6. \"Up The Coast\" 4:31 7. \"Solar Eyes\" 7:53 8. \"Lost Children\" 3:48 9. 
\"Back To You\" 4:22 10. \"Rise From The Dead\" 5:44 11. \"Get Up\" 3:37 Personnel Robert James\\xa0– vocals, guitar Francis Ten\\xa0– bass '\nHISTORY_ACTION: \ntype [329] [Mariqueen Maandig] [1]\nHISTORY_info: \n"}, {"role": "assistant", "content": "<think>The objective is to determine if Mariqueen Maandig and Pelle Almqvist are from the same country. From the current page, I have found that Mariqueen Maandig is described as an \"American singer, songwriter, and musician,\" which means she is from the United States. I now need to search for Pelle Almqvist's nationality to compare. The search box with id [1320] can be used for this purpose.\n<conclusion>Mariqueen Maandig is American (from the United States).</conclusion></think>\n```type [1320] [Pelle Almqvist] [1]```"}]}
4
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who does the voice for the elephant in sing\nObservation: [511] RootWebArea 'Search: Sing (film)' focused: True\n\t[679] textbox \"Search 'Wikipedia'\" required: False\n\t[681] link 'Go to welcome page'\n\t\t[1211] button '🏠'\n\t[682] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1213] button 'Wikipedia'\n\t[683] link 'Go to a randomly selected page'\n\t\t[1215] button '🎲'\n\t[524] StaticText 'Results 1-25 of 90,000 for \"Sing (film)\"'\n\t[570] link 'Sing (2016 American film)'\n\t[685] StaticText '...film focuses on a struggling theater owner who holds a singing competition to save his theatre from getting closed down, as well as how it 
interferes with the personal lives of its contestants. This article is about the film. For the same name of the franchise, see Sing (franchise). For the same name of the soundtrack, see Sing: Original Motion Picture Soundtrack. Sing Theatrical release poster Directed by Garth Jennings Written by Garth Jennings Produced by Chris Meledandri Janet Healy Starring......'\n\t[701] StaticText 'from Wikipedia'\n\t[702] StaticText '3,466 words'\n\t[574] link 'Sing Street'\n\t[705] StaticText 'Sing Street Sing Street is a 2016 musical coming-of-age comedy-drama film written and directed by John Carney from a story by Carney and Simon Carmody. Starring Ferdia Walsh-Peelo, Lucy Boynton, Maria Doyle Kennedy, Aidan Gillen, Jack Reynor, and Kelly Thornton, the story revolves around a boy starting a band to impress a girl in 1980s Ireland. It is an international co-production between producers from Ireland, the United Kingdom and the United States. For the stage adaptation, see Sing Street......'\n\t[716] StaticText 'from Wikipedia'\n\t[717] StaticText '3,383 words'\n\t[578] link 'Sing Boy Sing (film)'\n\t[747] StaticText '1,040 words'\n\t[582] link 'Bulbul Can Sing'\n\t[783] StaticText 'from Wikipedia'\n\t[784] StaticText '1,109 words'\n\t[586] link \"I've Heard the Mermaids Singing\"\n\t[797] StaticText '5,782 words'\n\t[590] link 'Everybody Sing (1938 film)'\nHISTORY_ACTION: type [21] [Sing (film)] [1]\n"}, {"role": "assistant", "content": "<think>The link labeled 'Sing (2016 American film)' is likely the main article about the movie \"Sing\" and should contain information about the cast, including the voice actor for the elephant character. Clicking this link should help me find the answer.</think>\n```click [570] [link 'Sing (2016 American film)']```"}]}
5
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: In what city is Macquarie Media broadcasting's 3AW talkback radio station located?\nObservation: [1301] RootWebArea 'Search: 3AW' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=3AW\n\t[1322] textbox \"Search 'Wikipedia'\" required: False\n\t[1328] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1329] button '🏠'\n\t[1330] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1331] button 'Wikipedia'\n\t[1332] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1333] button 
'🎲'\n\t[1033] StaticText 'Results 1-25 of 631 for \"3AW\"'\n\t[1341] link '3AW' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/3AW\n\t[1048] StaticText 'from Wikipedia'\n\t[1049] StaticText '2,569 words'\n\t[1350] link '3AW Breakfast' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/3AW_Breakfast\n\t[1062] StaticText '512 words'\n\t[1360] link 'Darren James (broadcaster)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Darren_James_(broadcaster)\n\t[1071] StaticText 'from Wikipedia'\n\t[1072] StaticText '773 words'\n\t[1368] link 'Tony Leonard' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Tony_Leonard\n\t[1074] StaticText '...3AW he stayed with this team until 2002. He began commentating AFL matches on 3AW in 1996 and he used to present a Saturday morning 3AW show called \"Glossing Over\" with Trevor Marmalade and Dr. Turf, until it was axed on 21 June 2007, following 3AW\\'s poor ratings in the football timeslots.[1] Since 1995, Leonard has appeared every Friday on Neil Mitchell\\'s morning 3AW program in a segment called \"3AW Pub of the Week\", reviewing hotels around Melbourne and surrounding areas.[2] Leonard is a long-......'\n\t[1085] StaticText 'from Wikipedia'\n\t[1086] StaticText '333 words'\n\t[1378] link 'Justin Smith (presenter)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Justin_Smith_(presenter)\n\t[1088] StaticText '...3AW Melbourne, Victoria.[2] He commenced his metropolitan radio career with 3AW and then moved to 2UE, both Fairfax Radio Network stations. 3AW Smith started at 3AW in 2001 and worked on a number of programs before becoming a producer of the high rating talkback radio show, 3AW Mornings with Neil Mitchell. 
He was appointed the senior producer in 2002 and left that role at the end of 2013, to take up a prime-time on-air role at 2UE.[3][4] He was also the alternate host for around five weeks per......'\n\t[1099] StaticText 'from Wikipedia'\n\t[1100] StaticText '1,158 words'\n\t[1388] link 'Australian Commercial Radio Awards' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Australian_Commercial_Radio_Awards\nHISTORY_ACTION: type [328] [3AW] [1]\n"}, {"role": "assistant", "content": "<think>The search results page shows a link titled \"3AW,\" which is likely the main article about the radio station. Clicking this link should provide detailed information, including its location.</think>\n```click [1341] [3AW]```"}]}
6
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who played the lion in the wizzard of oz\nObservation: [4451] RootWebArea 'Search: The Wizard of Oz film' focused: True\n\t[4619] textbox \"Search 'Wikipedia'\" required: False\n\t[4621] link 'Go to welcome page'\n\t\t[6286] button '🏠'\n\t[4622] link \"Go to the main page of 'Wikipedia'\"\n\t\t[6288] button 'Wikipedia'\n\t[4623] link 'Go to a randomly selected page'\n\t\t[6290] button '🎲'\n\t[4464] StaticText 'Results 1-25 of 5,000 for \"The Wizard of Oz film\"'\n\t[4510] link 'The Wonderful Wizard of Oz (1910 film)'\n\t[4689] StaticText 'from Wikipedia'\n\t[4690] StaticText '2,789 words'\n\t[4514] link 'The Wizard of Oz (1933 film)'\n\t[4693] 
StaticText 'The Wizard of Oz (1933 film) The Wizard of Oz is a 1933 Canadian-American animated short film directed by Ted Eshbaugh.[2] The story is credited to \"Col. Frank Baum.\" Frank Joslyn Baum, a Lieutenant Colonel in the United States Army and eldest son of writer L. Frank Baum, was involved in the film\\'s production, and may have had an involvement in the film\\'s script, which is loosely inspired by the elder Baum\\'s 1900 novel, The Wonderful Wizard of Oz. It runs approximately eight and a half minutes......'\n\t[4752] StaticText 'from Wikipedia'\n\t[4753] StaticText '1,775 words'\n\t[4518] link 'The Wizard of Oz (1982 film)'\n\t[4819] StaticText '1,763 words'\n\t[4522] link 'The Wonderful Wizard of Oz (1975 film)'\n\t[4890] StaticText 'from Wikipedia'\n\t[4891] StaticText '1,135 words'\n\t[4526] link 'The Wizard of Oz (1939 film)'\n\t[4954] StaticText '15,483 words'\n\t[4530] link 'The Wizard of Oz (1925 film)'\nHISTORY_ACTION: type [21] [The Wizard of Oz] [1]\nclick [570] [link 'The Wizard of Oz (pinball)']\ntype [2667] [The Wizard of Oz film] [1]\n"}, {"role": "assistant", "content": "<think>The user wants to know who played the lion in \"The Wizard of Oz\". The most famous version is the 1939 film. There is a link labeled 'The Wizard of Oz (1939 film)' ([4526]). Clicking this link should provide detailed information about the cast.</think>\n```click [4526] [link 'The Wizard of Oz (1939 film)']```"}]}
7
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: what is the range of a key fob\nObservation: [2108] RootWebArea 'Remote keyless system' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Remote_keyless_system\n\t[2172] textbox \"Search 'Wikipedia'\" required: False\n\t[2178] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[2179] button '🏠'\n\t[2180] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[2181] button 'Wikipedia'\n\t[2182] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[2183] button '🎲'\n\t[2212] group ''\n\t\t[1847] StaticText 
'The remote keyless systems using a handheld transmitter first began appearing on the French made '\n\t\t[2230] link 'Renault Fuego' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Renault_Fuego\n\t\t[1849] StaticText ' in 1982,'\n\t\t[2232] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Remote_keyless_system#cite_note-2\n\t\t[1850] StaticText ' and as an option on several '\n\t\t[2234] link 'American Motors' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/American_Motors\n\t\t[1852] StaticText ' vehicles in 1983, including the '\n\t\t[2235] link 'Renault Alliance' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Renault_Alliance\n\t\t[1854] StaticText '. The feature gained its first widespread availability in the U.S. on several '\n\t\t[2236] link 'General Motors' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/General_Motors\n\t\t[1856] StaticText ' vehicles in 1989.'\n\t[2237] group ''\n\t\t[2242] DisclosureTriangle 'Function' expanded: True\n\t\t\t[2243] heading 'Function'\n\t\t[1859] StaticText 'Keyless remotes contain a short-range '\n\t\t[2245] link 'radio transmitter' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Radio_transmitter\n\t\t[1861] StaticText ', and must be within a certain range, usually 5–20 meters, of the car to work. When a button is pushed, it sends a coded signal by '\n\t\t[2246] link 'radio waves' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Radio_wave\n\t\t[1863] StaticText ' to a receiver unit in the car, which locks or unlocks the door. Most RKEs operate at a '\n\t\t[2247] link 'frequency' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Frequency\n\t\t[1865] StaticText ' of 315\\xa0MHz for North America-made cars and at '\n\t\t[2249] link '433.92\\xa0MHz' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/LPD433\n\t\t[1871] StaticText ' for European, Japanese and Asian cars. 
Modern systems since the mid-1990s implement '\n\t\t[2251] link 'encryption' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Encryption\n\t\t[1873] StaticText ' as well as rotating entry codes to prevent car thieves from intercepting and '\n\t\t[2252] link 'spoofing' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Spoofing_attack\n\t\t[1875] StaticText ' the signal.'\n\t\t[2254] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Remote_keyless_system#cite_note-3\n\t\t[1876] StaticText ' Earlier systems used infrared instead of radio signals to unlock the vehicle, such as systems found on Mercedes-Benz,'\n\t\t[2257] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Remote_keyless_system#cite_note-4\n\t\t[2260] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Remote_keyless_system#cite_note-5\n\t\t[1878] StaticText ' and other manufacturers.'\n\t\t[1880] StaticText 'The functions of a remote keyless entry system are contained on a '\n\t\t[2264] link 'key fob' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Key_fob\n\t\t[1882] StaticText ' or built into the '\n\t\t[2265] link 'ignition' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ignition_system\n\t\t[1884] StaticText ' key handle itself. Buttons are dedicated to locking or unlocking the doors and opening the trunk or tailgate. On some minivans, the power sliding doors can be opened/closed remotely. Some cars will also close any open '\n\t\t[2266] link 'windows' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Power_window\n\t\t[1886] StaticText ' and roof when remotely locking the car. 
Some remote keyless fobs also feature a red '\n\t\t[2267] link 'panic button' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Panic_button\n\t\t[1888] StaticText ' which activates the '\n\t\t[2268] link 'car alarm' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Car_alarm\n\t\t[1891] StaticText 'On cars where the trunk release is electronically operated, it can be triggered to open by a button on the remote. Conventionally, the trunk springs open with the help of '\nHISTORY_ACTION: type [328] [key fob range] [1]\nclick [1302] [Remote keyless system]\nscroll [down]\n"}, {"role": "assistant", "content": "<think>The relevant information appears in the section under the heading \"Function.\" The text states that keyless remotes must be within a certain range, usually 5–20 meters, of the car to work. This answers the user's question directly.</think>\n```stop [The range of a key fob is usually 5–20 meters.]```"}]}
8
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which University is located in Singapore, Nanyang Technological University or American University of Beirut?\nObservation: [3482] RootWebArea 'Nanyang Technological University' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Technological_University\n\t[3602] textbox \"Search 'Wikipedia'\" required: False\n\t[3608] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[3609] button '🏠'\n\t[3610] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[3611] button 'Wikipedia'\n\t[3612] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[3613] button '🎲'\n\t[3620] heading 'Nanyang Technological University' hasPopup: menu\n\t[1938] StaticText 'The Nanyang Technological University (NTU) is a '\n\t[3626] link 'national' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/National_university\n\t[3627] link 'collegiate' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Collegiate_university\n\t[3628] link 'research university' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Research_university\n\t[3629] link 'Singapore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Singapore\n\t[1950] StaticText '. It is the second oldest '\n\t[3630] link 'autonomous university' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Autonomous_university\n\t[1952] StaticText ' in the country and is considered to be one of the top universities in the world by various international metrics. NTU is consistently ranked within the top 80 universities in the world according to most rankings, and is currently ranked 12th globally in the '\n\t[3631] link 'QS World University Rankings' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/QS_World_University_Rankings\n\t[1954] StaticText ' as of June 2021.'\n\t[3633] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Technological_University#cite_note-QS-NTU-QuickFacts-5\n\t[1955] StaticText ' It has also been ranked 1st amongst young universities by the '\n\t[3635] link 'QS World University Rankings' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/QS_World_University_Rankings\n\t[1957] StaticText ' since 2015 as of April 2021'\n\t[3637] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Technological_University#cite_note-6\n\t[3640] note ''\n\t\t[1958] StaticText 'Not to be confused with '\n\t\t[3641] link 'Nanyang Polytechnic' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Polytechnic\n\t\t[3642] link 'Nanyang Institute of Technology' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Institute_of_Technology\n\t[3645] table 'Nanyang Technological University'\n\t\t[3646] caption ''\n\t\t[3648] row ''\n\t\t\t[3649] cell '南洋理工大学\\xa0(Chinese) Universiti Teknologi Nanyang\\xa0\\xa0(Malay) நன்யாங் தொழில்நுட்ப பல்கலைக்கழகம்\\xa0(Tamil)'\n\t\t\t\t[3657] link 'Chinese' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Chinese_language\n\t\t\t\t[3663] link 'Malay' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Malay_language\n\t\t\t\t[1975] StaticText ')நன்யாங் தொழில்நுட்ப பல்கலைக்கழகம்\\xa0('\n\t\t\t\t[3668] link 'Tamil' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Tamil_language\n\t\t[3669] row ''\n\t\t\t[3670] cell ''\n\t\t\t\t[3538] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Nanyang_Technological_University_coat_of_arms_vector.svg.png.webp\n\t\t[3672] row ''\n\t\t\t[3673] rowheader 'Former names' required: False\n\t\t\t[3675] cell 'Nanyang Technological Institute (1981–1991)'\n\t\t[3676] row ''\n\t\t\t[3677] rowheader 'Type' required: False\n\t\t\t[3678] cell 'Autonomous university[]'\n\t\t\t\t[3679] link 'Autonomous university' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Autonomous_university\n\t\t\t\t[3681] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Technological_University#cite_note-MOE-1\n\t\t[3683] row ''\n\t\t\t[3684] rowheader 'Established' required: False\n\t\t\t[3685] cell '1\\xa0August 1981 (Nanyang Technological Institute) 1\\xa0July 1991 (Nanyang Technological University)'\n\t[2060] StaticText 'The university is organised into several colleges and schools, including the College of Engineering, College of Science, Nanyang Business School, '\n\t[3793] link 'Lee Kong Chian School of Medicine' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lee_Kong_Chian_School_of_Medicine\n\t[2062] StaticText ', College of Humanities, Arts and Social Sciences, Graduate College, '\n\t[3794] link 'National Institute of Education' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/National_Institute_of_Education\n\t[2064] StaticText ', and '\n\t[3795] link 'S. Rajaratnam School of International Studies' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/S._Rajaratnam_School_of_International_Studies\n\t[3798] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Nanyang_Technological_University#cite_note-7\n\t[3801] link 'Novena' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Novena%2C_Singapore\n\t[2071] StaticText ' and '\n\t[3802] link 'one-north' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/One-north\n\t[2073] StaticText ' respectively. As a relatively large university, it has 23,951 full-time enrolled students and 3846 full-time teaching staff.'\n\t[3803] group ''\n\t\t[3808] DisclosureTriangle 'History' expanded: True\n\t\t\t[3809] heading 'History'\n\t\t[3816] heading 'Nanyang University (1955–1980)'\nHISTORY_ACTION: type [328] [Nanyang Technological University] [1]\nclick [1375] [Nanyang Technological University]\n"}, {"role": "assistant", "content": "<think>The objective is to determine which university is located in Singapore: Nanyang Technological University or American University of Beirut. From the current page, it is clear that Nanyang Technological University (NTU) is located in Singapore, as indicated by the text and the link to \"Singapore.\" There is no need to check the American University of Beirut, as the answer is already clear from the current page.</think>\n```stop [Nanyang Technological University is located in Singapore.]```"}]}
9
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: how many wars has pakistan fought with india\nObservation: [1764] RootWebArea 'Search: wars between Pakistan and India' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=wars+between+Pakistan+and+India\n\t[1812] textbox \"Search 'Wikipedia'\" required: False\n\t[1818] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1819] button '🏠'\n\t[1820] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1821] button 'Wikipedia'\n\t[1822] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1823] button '🎲'\n\t[1025] StaticText 'Results 1-25 of 30,000 for \"wars between Pakistan and India\"'\n\t[1831] link 'Transport between India and Pakistan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Transport_between_India_and_Pakistan\n\t[1059] StaticText 'from Wikipedia'\n\t[1060] StaticText '1,959 words'\n\t[1849] link 'India–Pakistan relations' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/India%E2%80%93Pakistan_relations\n\t[1062] StaticText '...between the two states have been defined by the violent partition of British India in 1947 which started the Kashmir conflict, and the numerous military conflicts fought between the two nations. Consequently, their relationship has been plagued by hostility and suspicion. Northern India and Pakistan somewhat overlap in certain demographics and shared lingua francas (mainly Punjabi, Sindhi, and Hindustani). India–Pakistan relations Pakistan India Diplomatic mission High Commission of Pakistan......'\n\t[1093] StaticText 'from Wikipedia'\n\t[1094] StaticText '11,733 words'\n\t[1869] link 'Indo-Pakistani wars and conflicts' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Indo-Pakistani_wars_and_conflicts\n\t[1096] StaticText '...between the two states, with the exception of the Indo-Pakistani War of 1971, which occurred as a direct result of hostilities stemming from the Bangladesh Liberation War in erstwhile East Pakistan (now Bangladesh). 
India–Pakistan conflict Location of India (orange) and Pakistan (green) Date 22 October 1947 – present (74\\xa0years, 6\\xa0months and 6\\xa0days) Location Line of Control, India–Pakistan border Status Ongoing Direct engagements: Wars: First Kashmir War Second Kashmir War Indo-Pakistani......'\n\t[1125] StaticText 'from Wikipedia'\n\t[1126] StaticText '8,428 words'\n\t[1888] link 'Media coverage of 2019 India–Pakistan standoff' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Media_coverage_of_2019_India%E2%80%93Pakistan_standoff\n\t[1128] StaticText '...-Pakistan standoff was criticised for largely being \"jingoistic\" and \"nationalistic\", to the extent of the media war-mongering and the battle being fought between India andPakistan through newsrooms.[1][2][3] During the escalation, fake videos and misinformation were prevalent on the social media which were further reported to escalate tensions between India and Pakistan.[4][5] Once tensions started de-escalating, the media coverage shifted to comparisons being made between \"India and Pakistan\"......'\n\t[1162] StaticText 'from Wikipedia'\n\t[1163] StaticText '2,036 words'\n\t[1910] link 'India–Pakistan border' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/India%E2%80%93Pakistan_border\n\t[1165] StaticText 'India–Pakistan border The India–Pakistan border, known locally as the International Border (IB), is the international boundary that separates India and Pakistan. 
At its northern end is the Line of Control, which separates India from Pakistani-administered Kashmir, and at its southern end is Sir Creek, a tidal estuary in the Rann of Kutch between the Indian state of Gujarat and the Pakistani province of Sindh.[1] India–Pakistan border Nighttime panorama of the border from outer space......'\n\t[1191] StaticText 'from Wikipedia'\n\t[1192] StaticText '2,168 words'\n\t[1928] link 'Simla Agreement' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Simla_Agreement\n\t[1194] StaticText '...signed between India and Pakistan on 2 July 1972 in Shimla, the capital city of the Indian state of Himachal Pradesh.[3] It followed the Indo-Pakistani War of 1971, which began after India intervened in East Pakistan as an ally of Bengali rebels who were fighting against Pakistani state forces in the Bangladesh Liberation War.[4] The Indian intervention proved decisive in the '\nHISTORY_ACTION: type [328] [wars between Pakistan and India] [1]\n"}, {"role": "assistant", "content": "<think>The link titled 'Indo-Pakistani wars and conflicts' ([1869]) appears to be the most directly relevant to the user's objective of determining how many wars Pakistan has fought with India. Clicking this link should provide a focused article likely listing the wars and their details.</think>\n```click [1869] [Indo-Pakistani wars and conflicts]```"}]}
10
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who played pugsley in the original addams family\nObservation: [913] RootWebArea 'Pugsley Addams' focused: True\n\t[1099] textbox \"Search 'Wikipedia'\" required: False\n\t[1101] link 'Go to welcome page'\n\t\t[1107] button '🏠'\n\t[1102] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1109] button 'Wikipedia'\n\t[1103] link 'Go to a randomly selected page'\n\t\t[1111] button '🎲'\n\t[1113] heading 'Pugsley Addams' hasPopup: menu\n\t[1127] StaticText 'Pugsley Addams is a member of the fictional '\n\t[1129] link 'Addams family'\n\t[1130] StaticText ', created by American cartoonist '\n\t[1131] link 'Charles Addams'\n\t[922] table ''\n\t\t[1574] row 
''\n\t\t\t[1584] columnheader 'Pugsley Addams' required: False\n\t\t[1575] row ''\n\t\t\t[1585] gridcell 'The Addams Family character' required: False\n\t\t\t\t[1681] link 'The Addams Family'\n\t\t[1576] row ''\n\t\t\t[1586] gridcell 'Ken Weatherwax as Pugsley in The Addams Family television series' required: False\n\t\t\t\t[1687] link 'Ken Weatherwax'\n\t\t\t\t[1690] link 'The Addams Family'\n\t\t[1577] row ''\n\t\t\t[1587] rowheader 'First appearance' required: False\n\t\t\t[1588] gridcell 'The New Yorker (1938)' required: False\n\t\t\t\t[1696] link 'The New Yorker'\n\t\t[1578] row ''\n\t\t\t[1589] rowheader 'Created by' required: False\n\t\t\t[1590] gridcell 'Charles Addams' required: False\n\t\t\t\t[1700] link 'Charles Addams'\n\t\t[1579] row ''\n\t\t\t[1591] rowheader 'Portrayed by' required: False\n\t\t\t[1592] gridcell 'Ken Weatherwax (1964 series) Jimmy Workman (1991 and 1993 films) Jerry Messing (1998 film) Brody Smith (1998 series)' required: False\n\t\t\t\t[1703] link 'Ken Weatherwax'\n\t\t\t\t[1706] link 'Jimmy Workman'\n\t\t\t\t[1709] link 'Jerry Messing'\n\t\t\t\t[1710] StaticText ' (1998 film)Brody Smith (1998 series)'\n\t\t[1580] row ''\n\t\t\t[1593] rowheader 'Voiced by' required: False\n\t\t\t[1594] gridcell 'Jodie Foster (1973 animated series) Jeannie Elias (1992 animated series) Finn Wolfhard (2019 film) Javon Walton (2021 film)' required: False\n\t\t\t\t[1717] link 'Jodie Foster'\n\t\t\t\t[1719] link '1973 animated series'\n\t\t\t\t[1722] link 'Jeannie Elias'\n\t\t\t\t[1724] link '1992 animated series'\n\t\t\t\t[1727] link 'Finn Wolfhard'\n\t\t\t\t[1729] link '2019 film'\n\t[1026] Details ''\n\t\t[1028] DisclosureTriangle 'Cartoons' expanded: True\n\t\t\t[1133] heading 'Cartoons'\n\t[929] Details ''\n\t\t[1031] DisclosureTriangle 'TV shows' expanded: True\n\t\t\t[1137] heading 'TV shows'\n\t\t[1147] StaticText 'Pugsley is the oldest child of '\n\t\t[1148] link 'Morticia'\n\t\t[1149] StaticText ' and '\n\t\t[1150] link 'Gomez Addams'\n\t\t[1151] 
StaticText '. He is a young boy (his age is given as eight years old in the pilot episode but is changed to 10 years old in the second episode) who is almost always seen wearing a striped '\n\t\t[1152] link 'T-shirt'\n\t\t[1153] StaticText ' and shorts. Originally unnamed (as were all of the family members), the character who was apparently the prototype for Pugsley first appeared in the '\n\t\t[1154] link 'Charles Addams'\n\t\t[1155] StaticText ' cartoons in '\n\t\t[1157] link 'The New Yorker'\n\t\t[1158] StaticText ' during the 1930s. In this first incarnation, he was portrayed as a deviant child with a vicious nature, shown committing deplorable acts with his sister. In all incarnations, he is overweight. In the original series, Pugsley is 1.42 metres tall and weighs 112 pounds. In the television series, Pugsley usually eats over five pieces of cake at birthday parties.'\n\t\t[931] blockquote ''\n\t\t\t[1609] StaticText 'An energetic monster of a boy…blond red hair, popped blue eyes and a dedicated troublemaker, in other words, the kid next door…genius in his own way, he makes toy '\n\t\t\t[1610] link 'guillotines'\n\t\t\t[1611] StaticText ', full size '\nHISTORY_ACTION: type [21] [Pugsley Addams Family] [1]\nclick [568] [link 'Pugsley Addams']\n"}, {"role": "assistant", "content": "<think>From the table, under the rowheader 'Portrayed by', the gridcell lists \"Ken Weatherwax (1964 series)\" as the actor for Pugsley in the original Addams Family television series. This matches the user's query about the original portrayal. I can now provide the answer.</think>\n```stop [Ken Weatherwax]```"}]}
11
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\nInformation already found: Information related to the current query that has been identified in historical actions. You need to integrate and supplement this information.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. 
If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n9.Between <think></think>, you need to use <conclusion></conclusion> to enclose the information obtained in this round that is relevant to the current query. Note that if there is no valid information, this part is not required. 
The enclosed information must be directly usable to answer the original query."}, {"role": "user", "content": "\nObjective: The Dolbina exacta is found on this sovereign island nation located in what ocean?\nObservation: [1560] RootWebArea 'Dolbina exacta' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dolbina_exacta\n\t[1684] textbox \"Search 'Wikipedia'\" required: False\n\t[1690] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1691] button '🏠'\n\t[1692] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1693] button 'Wikipedia'\n\t[1694] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1695] button '🎲'\n\t[1702] heading 'Dolbina exacta' hasPopup: menu\n\t[1440] StaticText 'Dolbina exacta, the exact grizzled hawkmoth, is a species of '\n\t[1710] link 'moth' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Moth\n\t[1445] StaticText ' of the family '\n\t[1711] link 'Sphingidae' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sphingidae\n\t[1714] table ''\n\t\t[1716] row ''\n\t\t\t[1717] columnheader 'Dolbina exacta' required: False\n\t\t[1719] row ''\n\t\t\t[1720] cell ''\n\t\t\t\t[1613] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Dolbina_exacta_MHNT_CUT_2010_0_486_Shibinai_Higashikaurat_Japan_male_dorsal.jpg.webp\n\t\t[1722] row ''\n\t\t\t[1723] cell 'Dolbina exacta ♂'\n\t\t[1725] row ''\n\t\t\t[1726] cell ''\n\t\t\t\t[1614] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Dolbina_exacta_MHNT_CUT_2010_0_486_Shibinai_Higashikaurat_Japan_male_ventral.jpg.webp\n\t\t[1728] row ''\n\t\t\t[1729] cell 'Dolbina exacta ♂ △'\n\t\t[1732] row ''\n\t\t\t[1733] columnheader 'Scientific classification' required: False\n\t\t\t\t[1734] link 'Scientific classification' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Taxonomy_(biology)\n\t\t\t\t[1612] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Red_Pencil_Icon.png.webp\n\t\t[1737] row ''\n\t\t\t[1738] cell 'Kingdom:'\n\t\t\t[1739] cell 'Animalia'\n\t\t\t\t[1740] link 'Animalia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Animal\n\t\t[1741] row ''\n\t\t\t[1742] cell 'Phylum:'\n\t\t\t[1743] cell 'Arthropoda'\n\t\t\t\t[1744] link 'Arthropoda' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Arthropod\n\t\t[1745] row ''\n\t\t\t[1746] cell 'Class:'\n\t\t\t[1747] cell 'Insecta'\n\t\t\t\t[1748] link 'Insecta' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Insect\n\t\t[1749] row ''\n\t\t\t[1750] cell 'Order:'\n\t\t\t[1751] cell 'Lepidoptera'\n\t\t\t\t[1752] link 'Lepidoptera' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lepidoptera\n\t\t[1753] row ''\n\t\t\t[1754] cell 'Family:'\n\t\t\t[1755] cell 'Sphingidae'\n\t\t\t\t[1756] link 'Sphingidae' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sphingidae\n\t\t[1757] row ''\n\t\t\t[1758] cell 'Genus:'\n\t\t\t[1759] cell 'Dolbina'\n\t\t\t\t[1760] link 'Dolbina' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dolbina\n\t[1792] group ''\n\t\t[1797] DisclosureTriangle 'Distribution' expanded: True\n\t\t\t[1798] heading 'Distribution'\n\t\t[1482] StaticText 'It is found from the southern part of the '\n\t\t[1800] link 'Russian Far East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Russian_Far_East\n\t\t[1801] link 'Japan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Japan\n\t\t[1486] StaticText ' and the '\n\t\t[1802] link 'Korean Peninsula' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Korean_Peninsula\n\t\t[1488] StaticText ', south into '\n\t\t[1803] link 'China' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/China\n\t\t[1490] StaticText ' as far as '\n\t\t[1804] link 'Sichuan' 
url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sichuan\n\t\t[1805] link 'Hubei' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Hubei\n\t\t[1494] StaticText ' and '\n\t\t[1806] link 'Zhejiang' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Zhejiang\n\t\t[1808] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dolbina_exacta#cite_note-2\n\t[1811] group ''\n\t\t[1816] DisclosureTriangle 'Description' expanded: True\n\t\t\t[1817] heading 'Description'\n\t\t[1819] link 'wingspan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wingspan\n\t\t[1502] StaticText ' is 55–58\\xa0mm. Both wings and the abdomen undersides are brownish grey. There are small black basal spots on the abdomen underside.'\n\t\t[1610] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Dolbina_exacta_MHNT_CUT_2010_0_486_Ussuri%252C_Gornotajosnoje%252C_Russia_female_dorsal.jpg.webp\n\t\t[1505] StaticText 'Dolbina exacta ♀'\n\t\t[1611] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Dolbina_exacta_MHNT_CUT_2010_0_486_Ussuri%252C_Gornotajosnoje%252C_Russia_female_ventral.jpg.webp\nHISTORY_ACTION: \ntype [328] [Dolbina exacta] [1]\nclick [1222] [Dolbina exacta]\nHISTORY_info: \n"}, {"role": "assistant", "content": "<think>We have confirmed from the distribution section that Dolbina exacta is found in Japan, which is a sovereign island nation. The remaining part of the question is to identify in which ocean Japan is located. To confirm this, I will click the link to the Japan article to check for the ocean information.</think>\n```click [1801] [Japan]```"}]}
12
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Sigrid Arnoldson is believed by critics to have been the successor to what famous opera singer who was often known as the \"Swedish Nightingale\"?\nObservation: [292] RootWebArea 'User:The other Kiwix guy/Landing' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing\n\t[327] textbox \"Search 'Wikipedia'\" required: False\n\t[333] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[334] button '🏠'\n\t[335] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[336] button 'Wikipedia'\n\t[337] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[338] button '🎲'\n\t[3] StaticText 'Welcome to '\n\t[349] link 'Wikipedia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wikipedia\n\t[5] StaticText 'The free encyclopedia.'\n\t[6] StaticText '6,489,052'\n\t[7] StaticText ' articles in '\n\t[355] link 'English' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/English_Wikipedia\n\t[357] heading 'Arts'\n\t[359] link 'Architecture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Architecture\n\t[360] link 'Books' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Books\n\t[361] link 'Cinematography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cinematography\n\t[362] link 'Dance' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dance\n\t[363] link 'Design' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Design\n\t[364] link 'Fashion' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fashion\n\t[365] link 'Films' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Films\n\t[366] link 'Gastronomy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gastronomy\n\t[367] link 'Literature' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Literature\n\t[368] link 'Magic (illusion)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Magic_(illusion)\n\t[369] link 'Music' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Music\n\t[370] link 'Painting' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Painting\n\t[371] link 'Photography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Photography\n\t[372] link 'Poetry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Poetry\n\t[373] link 'Sculpture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sculpture\n\t[374] link 'Theatre' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Theatre\n\t[376] heading 
'Geography'\n\t[378] link 'Africa' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Africa\n\t[379] link 'Antarctica' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Antarctica\n\t[380] link 'Arctic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Arctic\n\t[381] link 'Asia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Asia\n\t[382] link 'Caribbean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Caribbean\n\t[383] link 'Central America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Central_America\n\t[384] link 'Europe' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Europe\n\t[385] link 'Latin America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Latin_America\n\t[386] link 'Mediterranean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mediterranean\n\t[387] link 'Middle East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_East\n\t[388] link 'North America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/North_America\n\t[389] link 'Oceania' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Oceania\n\t[390] link 'South America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/South_America\n\t[391] link 'Cartography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cartography\n\t[393] heading 'History'\n\t[395] link 'Ancient Egypt' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Egypt\n\t[396] link 'Ancient Greece' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Greece\n\t[397] link 'Ancient Near East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Near_East\n\t[398] link 'Ancient Rome' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Rome\n\t[399] link 'Archaeology' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Archaeology\n\t[400] link 'British Empire' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/British_Empire\n\t[401] link 'Byzantine Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Byzantine_Empire\n\t[402] link 'Colonialism' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Colonialism\n\t[403] link 'Crusades' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Crusades\n\t[404] link 'Heraldry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heraldry\n\t[405] link 'History of science' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_science\n\t[406] link 'Imperial China' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_China#Imperial_China\n\t[407] link 'Indian independence movement' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Indian_independence_movement\n\t[408] link 'Japan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_Japan\n\t[409] link 'Middle Ages' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_Ages\n\t[410] link 'Mughal Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mughal_Empire\n\t[411] link 'Ottoman Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ottoman_Empire\n\t[412] link 'Russian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Russian_Empire\n\t[413] link 'Sasanian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sasanian_Empire\n\t[414] link 'Seljuk Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Seljuk_Empire\n\t[415] link 'Soviet Union' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Soviet_Union\n\t[416] link 'War' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/War\n\t[418] heading 'Sciences'\nHISTORY_ACTION: \n\n"}, {"role": "assistant", "content": "<think>The user's objective is to find out which famous opera singer, known as the \"Swedish Nightingale,\" Sigrid Arnoldson was believed to be the successor 
to. The best approach is to use the search function on this Wikipedia instance. There is a search textbox with id [327] labeled \"Search 'Wikipedia'\". I will type \"Sigrid Arnoldson\" into the search box to find her article, which should contain the relevant information.</think>\n```type [327] [Sigrid Arnoldson] [1]```"}]}