hongzhuyi commited on
Commit
9747166
·
verified ·
1 Parent(s): a2a0e56

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. log/20250820-18:06:07.log +130 -0
  3. log/20250820-18:07:11.log +130 -0
  4. log/20250820-18:08:00.log +0 -0
  5. log/20250820-21:12:01.log +0 -0
  6. log/20250821-16:42:45.log +0 -0
  7. log/20250821-21:47:08.log +645 -0
  8. log/20250821-21:56:01.log +0 -0
  9. log/20250821-23:09:03.log +0 -0
  10. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/args.json +380 -0
  11. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/added_tokens.json +24 -0
  12. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/args.json +380 -0
  13. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/chat_template.jinja +54 -0
  14. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/config.json +60 -0
  15. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/generation_config.json +9 -0
  16. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/latest +1 -0
  17. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/merges.txt +0 -0
  18. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00001-of-00004.safetensors +3 -0
  19. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00002-of-00004.safetensors +3 -0
  20. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00003-of-00004.safetensors +3 -0
  21. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00004-of-00004.safetensors +3 -0
  22. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model.safetensors.index.json +347 -0
  23. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/scheduler.pt +3 -0
  24. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/special_tokens_map.json +31 -0
  25. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json +3 -0
  26. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer_config.json +207 -0
  27. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/trainer_state.json +1300 -0
  28. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/training_args.bin +3 -0
  29. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/vocab.json +0 -0
  30. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/zero_to_fp32.py +760 -0
  31. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_loss.png +0 -0
  32. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_runtime.png +0 -0
  33. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_samples_per_second.png +0 -0
  34. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_steps_per_second.png +0 -0
  35. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_token_acc.png +0 -0
  36. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_epoch.png +0 -0
  37. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_grad_norm.png +0 -0
  38. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_learning_rate.png +0 -0
  39. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_loss.png +0 -0
  40. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_token_acc.png +0 -0
  41. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_total_flos.png +0 -0
  42. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_loss.png +0 -0
  43. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_runtime.png +0 -0
  44. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_samples_per_second.png +0 -0
  45. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_steps_per_second.png +0 -0
  46. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/logging.jsonl +160 -0
  47. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/runs/events.out.tfevents.1755695667.TENCENT64.site.2602616.0 +3 -0
  48. qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/val_dataset.jsonl +9 -0
  49. qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/args.json +380 -0
  50. qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/added_tokens.json +24 -0
.gitattributes CHANGED
@@ -43,3 +43,5 @@ checkpoint-197/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
  qwen2.5-7b-1700q-661q-1e_6-old-click-1ep/checkpoint-497/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
  qwen2.5-7b-2225q-661q-1e_6-old-click-1ep/checkpoint-607/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
  qwen2.5-7b-rft-661q-all-1e_6-old-click-1ep/checkpoint-243/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
43
  qwen2.5-7b-1700q-661q-1e_6-old-click-1ep/checkpoint-497/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
  qwen2.5-7b-2225q-661q-1e_6-old-click-1ep/checkpoint-607/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
  qwen2.5-7b-rft-661q-all-1e_6-old-click-1ep/checkpoint-243/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/tokenizer.json filter=lfs diff=lfs merge=lfs -text
log/20250820-18:06:07.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Base --train_type full --dataset corr_nq_1500q_hotpot_1500q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 8 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
2
+
3
+ *****************************************
4
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
5
+ *****************************************
6
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
7
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
8
+ [ERROR:modelscope] Repo Qwen/Qwen2.5-7B-Base not exists on either https://www.modelscope.cn or https://www.modelscope.ai
9
+ Traceback (most recent call last):
10
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 773, in urlopen
11
+ self._prepare_proxy(conn)
12
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1042, in _prepare_proxy
13
+ conn.connect()
14
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 770, in connect
15
+ self._tunnel()
16
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 269, in _tunnel
17
+ raise OSError(
18
+ OSError: Tunnel connection failed: 403 Forbidden
19
+
20
+ The above exception was the direct cause of the following exception:
21
+
22
+ urllib3.exceptions.ProxyError: ('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden'))
23
+
24
+ The above exception was the direct cause of the following exception:
25
+
26
+ Traceback (most recent call last):
27
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
28
+ resp = conn.urlopen(
29
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
30
+ return self.urlopen(
31
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
32
+ return self.urlopen(
33
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
34
+ retries = retries.increment(
35
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
36
+ raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
37
+ urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
38
+
39
+ During handling of the above exception, another exception occurred:
40
+
41
+ Traceback (most recent call last):
42
+ File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
43
+ sft_main()
44
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
45
+ return SwiftSft(args).main()
46
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 27, in __init__
47
+ super().__init__(args)
48
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 19, in __init__
49
+ self.args = self._parse_args(args)
50
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 31, in _parse_args
51
+ args, remaining_argv = parse_args(self.args_class, args)
52
+ File "/group/40143/hongzhuyi/ms-swift/swift/utils/utils.py", line 152, in parse_args
53
+ args, remaining_args = parser.parse_args_into_dataclasses(argv, return_remaining_strings=True)
54
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
55
+ obj = dtype(**inputs)
56
+ File "<string>", line 323, in __init__
57
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 171, in __post_init__
58
+ BaseArguments.__post_init__(self)
59
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/base_args.py", line 171, in __post_init__
60
+ ModelArguments.__post_init__(self)
61
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 176, in __post_init__
62
+ self._init_torch_dtype()
63
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 90, in _init_torch_dtype
64
+ self.torch_dtype: torch.dtype = self._init_model_info()
65
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 145, in _init_model_info
66
+ self.model_info, self.model_meta = get_model_info_meta(**self.get_model_kwargs())
67
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/register.py", line 533, in get_model_info_meta
68
+ model_dir = safe_snapshot_download(
69
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/utils.py", line 302, in safe_snapshot_download
70
+ model_dir = hub.download_model(model_id_or_path, revision, ignore_patterns, token=hub_token, **kwargs)
71
+ File "/group/40143/hongzhuyi/ms-swift/swift/hub/hub.py", line 317, in download_model
72
+ return snapshot_download(model_id_or_path, revision, ignore_patterns=ignore_patterns, **kwargs)
73
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 132, in snapshot_download
74
+ return _snapshot_download(
75
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 298, in _snapshot_download
76
+ endpoint = _api.get_endpoint_for_read(
77
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 390, in get_endpoint_for_read
78
+ self.repo_exists(
79
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 446, in repo_exists
80
+ r = self.session.get(path, cookies=cookies,
81
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 602, in get
82
+ return self.request("GET", url, **kwargs)
83
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
84
+ resp = self.send(prep, **send_kwargs)
85
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
86
+ r = adapter.send(request, **kwargs)
87
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 694, in send
88
+ raise ProxyError(e, request=request)
89
+ requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
90
+ W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577686 closing signal SIGTERM
91
+ W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577687 closing signal SIGTERM
92
+ W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577688 closing signal SIGTERM
93
+ W0820 18:06:47.469000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577689 closing signal SIGTERM
94
+ W0820 18:06:47.470000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577690 closing signal SIGTERM
95
+ W0820 18:06:47.472000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577691 closing signal SIGTERM
96
+ W0820 18:06:47.474000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577693 closing signal SIGTERM
97
+ E0820 18:06:47.570000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 6 (pid: 2577692) of binary: /data/miniforge/envs/ms-swift/bin/python3.10
98
+ Traceback (most recent call last):
99
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
100
+ return _run_code(code, main_globals, None,
101
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
102
+ exec(code, run_globals)
103
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
104
+ main()
105
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
106
+ return f(*args, **kwargs)
107
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
108
+ run(args)
109
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
110
+ elastic_launch(
111
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
112
+ return launch_agent(self._config, self._entrypoint, list(args))
113
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
114
+ raise ChildFailedError(
115
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
116
+ ============================================================
117
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
118
+ ------------------------------------------------------------
119
+ Failures:
120
+ <NO_OTHER_FAILURES>
121
+ ------------------------------------------------------------
122
+ Root Cause (first observed failure):
123
+ [0]:
124
+ time : 2025-08-20_18:06:47
125
+ host : TENCENT64.site
126
+ rank : 6 (local_rank: 6)
127
+ exitcode : 1 (pid: 2577692)
128
+ error_file: <N/A>
129
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
130
+ ============================================================
log/20250820-18:07:11.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Base --train_type full --dataset corr_nq_1500q_hotpot_1500q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 8 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
2
+
3
+ *****************************************
4
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
5
+ *****************************************
6
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
7
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
8
+ [ERROR:modelscope] Repo Qwen/Qwen2.5-7B-Base not exists on either https://www.modelscope.cn or https://www.modelscope.ai
9
+ Traceback (most recent call last):
10
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 773, in urlopen
11
+ self._prepare_proxy(conn)
12
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1042, in _prepare_proxy
13
+ conn.connect()
14
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 770, in connect
15
+ self._tunnel()
16
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 269, in _tunnel
17
+ raise OSError(
18
+ OSError: Tunnel connection failed: 403 Forbidden
19
+
20
+ The above exception was the direct cause of the following exception:
21
+
22
+ urllib3.exceptions.ProxyError: ('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden'))
23
+
24
+ The above exception was the direct cause of the following exception:
25
+
26
+ Traceback (most recent call last):
27
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
28
+ resp = conn.urlopen(
29
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
30
+ return self.urlopen(
31
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
32
+ return self.urlopen(
33
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
34
+ retries = retries.increment(
35
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
36
+ raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
37
+ urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
38
+
39
+ During handling of the above exception, another exception occurred:
40
+
41
+ Traceback (most recent call last):
42
+ File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
43
+ sft_main()
44
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
45
+ return SwiftSft(args).main()
46
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 27, in __init__
47
+ super().__init__(args)
48
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 19, in __init__
49
+ self.args = self._parse_args(args)
50
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 31, in _parse_args
51
+ args, remaining_argv = parse_args(self.args_class, args)
52
+ File "/group/40143/hongzhuyi/ms-swift/swift/utils/utils.py", line 152, in parse_args
53
+ args, remaining_args = parser.parse_args_into_dataclasses(argv, return_remaining_strings=True)
54
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
55
+ obj = dtype(**inputs)
56
+ File "<string>", line 323, in __init__
57
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 171, in __post_init__
58
+ BaseArguments.__post_init__(self)
59
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/base_args.py", line 171, in __post_init__
60
+ ModelArguments.__post_init__(self)
61
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 176, in __post_init__
62
+ self._init_torch_dtype()
63
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 90, in _init_torch_dtype
64
+ self.torch_dtype: torch.dtype = self._init_model_info()
65
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 145, in _init_model_info
66
+ self.model_info, self.model_meta = get_model_info_meta(**self.get_model_kwargs())
67
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/register.py", line 533, in get_model_info_meta
68
+ model_dir = safe_snapshot_download(
69
+ File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/utils.py", line 302, in safe_snapshot_download
70
+ model_dir = hub.download_model(model_id_or_path, revision, ignore_patterns, token=hub_token, **kwargs)
71
+ File "/group/40143/hongzhuyi/ms-swift/swift/hub/hub.py", line 317, in download_model
72
+ return snapshot_download(model_id_or_path, revision, ignore_patterns=ignore_patterns, **kwargs)
73
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 132, in snapshot_download
74
+ return _snapshot_download(
75
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 298, in _snapshot_download
76
+ endpoint = _api.get_endpoint_for_read(
77
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 390, in get_endpoint_for_read
78
+ self.repo_exists(
79
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 446, in repo_exists
80
+ r = self.session.get(path, cookies=cookies,
81
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 602, in get
82
+ return self.request("GET", url, **kwargs)
83
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
84
+ resp = self.send(prep, **send_kwargs)
85
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
86
+ r = adapter.send(request, **kwargs)
87
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 694, in send
88
+ raise ProxyError(e, request=request)
89
+ requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
90
+ W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580150 closing signal SIGTERM
91
+ W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580151 closing signal SIGTERM
92
+ W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580153 closing signal SIGTERM
93
+ W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580154 closing signal SIGTERM
94
+ W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580155 closing signal SIGTERM
95
+ W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580156 closing signal SIGTERM
96
+ W0820 18:07:24.032000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580157 closing signal SIGTERM
97
+ E0820 18:07:24.129000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 2580152) of binary: /data/miniforge/envs/ms-swift/bin/python3.10
98
+ Traceback (most recent call last):
99
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
100
+ return _run_code(code, main_globals, None,
101
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
102
+ exec(code, run_globals)
103
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
104
+ main()
105
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
106
+ return f(*args, **kwargs)
107
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
108
+ run(args)
109
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
110
+ elastic_launch(
111
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
112
+ return launch_agent(self._config, self._entrypoint, list(args))
113
+ File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
114
+ raise ChildFailedError(
115
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
116
+ ============================================================
117
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
118
+ ------------------------------------------------------------
119
+ Failures:
120
+ <NO_OTHER_FAILURES>
121
+ ------------------------------------------------------------
122
+ Root Cause (first observed failure):
123
+ [0]:
124
+ time : 2025-08-20_18:07:24
125
+ host : TENCENT64.site
126
+ rank : 2 (local_rank: 2)
127
+ exitcode : 1 (pid: 2580152)
128
+ error_file: <N/A>
129
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
130
+ ============================================================
log/20250820-18:08:00.log ADDED
The diff for this file is too large to render. See raw diff
 
log/20250820-21:12:01.log ADDED
The diff for this file is too large to render. See raw diff
 
log/20250821-16:42:45.log ADDED
The diff for this file is too large to render. See raw diff
 
log/20250821-21:47:08.log ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
 
 
1
 
 
 
1
+ run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B --train_type full --dataset /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 4 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
2
+
3
+ *****************************************
4
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
5
+ *****************************************
6
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
7
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
8
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
9
+ [2025-08-21 21:47:33,244] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
10
+ [2025-08-21 21:47:34,608] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
11
+ [2025-08-21 21:47:34,617] [INFO] [comm.py:821:init_distributed] cdb=None
12
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
13
+ [2025-08-21 21:47:38,267] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
14
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
15
+ [2025-08-21 21:47:39,606] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
16
+ [2025-08-21 21:47:39,615] [INFO] [comm.py:821:init_distributed] cdb=None
17
+ [2025-08-21 21:47:42,236] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
18
+ [2025-08-21 21:47:43,646] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
19
+ [2025-08-21 21:47:43,655] [INFO] [comm.py:821:init_distributed] cdb=None
20
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
21
+ [2025-08-21 21:47:48,256] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
22
+ [2025-08-21 21:47:49,600] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
23
+ [2025-08-21 21:47:49,609] [INFO] [comm.py:821:init_distributed] cdb=None
24
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
25
+ [2025-08-21 21:47:54,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-08-21 21:47:55,585] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
27
+ [2025-08-21 21:47:55,593] [INFO] [comm.py:821:init_distributed] cdb=None
28
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
29
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B
30
+ [2025-08-21 21:47:58,145] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
31
+ [2025-08-21 21:47:59,817] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
32
+ [2025-08-21 21:47:59,826] [INFO] [comm.py:821:init_distributed] cdb=None
33
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
34
+ [INFO:modelscope] Target directory already exists, skipping creation.
35
+ [INFO:swift] Loading the model using model_dir: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B
36
+ [INFO:swift] Setting args.lazy_tokenize: False
37
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
38
+ [2025-08-21 21:48:36,241] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
39
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
40
+ [2025-08-21 21:48:37,911] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
41
+ [2025-08-21 21:48:37,921] [INFO] [comm.py:821:init_distributed] cdb=None
42
+ [2025-08-21 21:48:37,921] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
43
+ [2025-08-21 21:48:41,189] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
44
+ [2025-08-21 21:48:42,841] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
45
+ [2025-08-21 21:48:42,850] [INFO] [comm.py:821:init_distributed] cdb=None
46
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846
47
+ [INFO:swift] Global seed set to 42
48
+ [INFO:swift] args: TrainArguments(
49
+ _n_gpu=-1,
50
+ acc_strategy=token,
51
+ accelerator_config={'dispatch_batches': False},
52
+ adafactor=False,
53
+ adalora_beta1=0.85,
54
+ adalora_beta2=0.85,
55
+ adalora_deltaT=1,
56
+ adalora_init_r=12,
57
+ adalora_orth_reg_weight=0.5,
58
+ adalora_target_r=8,
59
+ adalora_tfinal=0,
60
+ adalora_tinit=0,
61
+ adam_beta1=0.9,
62
+ adam_beta2=0.95,
63
+ adam_epsilon=1e-08,
64
+ adapter_act=gelu,
65
+ adapter_length=128,
66
+ adapters=[],
67
+ add_version=True,
68
+ agent_template=None,
69
+ aligner_lr=None,
70
+ attn_impl=None,
71
+ auto_find_batch_size=False,
72
+ average_tokens_across_devices=True,
73
+ batch_eval_metrics=False,
74
+ bf16=True,
75
+ bf16_full_eval=False,
76
+ bnb_4bit_compute_dtype=torch.bfloat16,
77
+ bnb_4bit_quant_storage=None,
78
+ bnb_4bit_quant_type=nf4,
79
+ bnb_4bit_use_double_quant=True,
80
+ boft_block_num=0,
81
+ boft_block_size=4,
82
+ boft_dropout=0.0,
83
+ boft_n_butterfly_factor=1,
84
+ cached_dataset=[],
85
+ channels=None,
86
+ check_model=True,
87
+ ckpt_dir=None,
88
+ columns={},
89
+ create_checkpoint_symlink=False,
90
+ custom_dataset_info=[],
91
+ custom_register_path=[],
92
+ data_seed=42,
93
+ dataloader_drop_last=False,
94
+ dataloader_num_workers=48,
95
+ dataloader_persistent_workers=False,
96
+ dataloader_pin_memory=True,
97
+ dataloader_prefetch_factor=None,
98
+ dataset=['/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_swift.jsonl'],
99
+ dataset_num_proc=100,
100
+ dataset_shuffle=True,
101
+ ddp_backend=None,
102
+ ddp_broadcast_buffers=None,
103
+ ddp_bucket_cap_mb=None,
104
+ ddp_find_unused_parameters=None,
105
+ ddp_timeout=18000000,
106
+ debug=None,
107
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
108
+ deepspeed_autotp_size=None,
109
+ device_map=None,
110
+ disable_tqdm=None,
111
+ do_eval=False,
112
+ do_predict=False,
113
+ do_train=False,
114
+ download_mode=reuse_dataset_if_exists,
115
+ ds3_gather_for_generation=True,
116
+ early_stop_interval=None,
117
+ enable_dft_loss=False,
118
+ eval_accumulation_steps=None,
119
+ eval_dataset=[],
120
+ eval_dataset_args=None,
121
+ eval_delay=0,
122
+ eval_do_concat_batches=True,
123
+ eval_generation_config=None,
124
+ eval_limit=None,
125
+ eval_on_start=False,
126
+ eval_steps=20.0,
127
+ eval_strategy=epoch,
128
+ eval_use_evalscope=False,
129
+ eval_use_gather_object=False,
130
+ external_plugins=[],
131
+ extra_eval_args=None,
132
+ fourier_n_frequency=2000,
133
+ fourier_scaling=300.0,
134
+ fp16=False,
135
+ fp16_backend=auto,
136
+ fp16_full_eval=False,
137
+ fp16_opt_level=O1,
138
+ freeze_aligner=False,
139
+ freeze_llm=False,
140
+ freeze_parameters=[],
141
+ freeze_parameters_ratio=0.0,
142
+ freeze_parameters_regex=None,
143
+ freeze_vit=True,
144
+ fsdp=,
145
+ fsdp_config=None,
146
+ fsdp_min_num_params=0,
147
+ fsdp_transformer_layer_cls_to_wrap=None,
148
+ full_determinism=False,
149
+ galore_cos_threshold=0.4,
150
+ galore_gamma_proj=2,
151
+ galore_optim_per_parameter=False,
152
+ galore_proj_bits=4,
153
+ galore_proj_group_size=256,
154
+ galore_proj_quant=False,
155
+ galore_proj_type=std,
156
+ galore_quantization=False,
157
+ galore_queue_size=5,
158
+ galore_rank=128,
159
+ galore_scale=1.0,
160
+ galore_target_modules=None,
161
+ galore_update_proj_gap=50,
162
+ galore_with_embedding=False,
163
+ generation_config=None,
164
+ generation_max_length=None,
165
+ generation_num_beams=None,
166
+ gradient_accumulation_steps=2,
167
+ gradient_checkpointing=True,
168
+ gradient_checkpointing_kwargs=None,
169
+ greater_is_better=False,
170
+ group_by_length=False,
171
+ half_precision_backend=auto,
172
+ hqq_axis=None,
173
+ hub_always_push=False,
174
+ hub_model_id=None,
175
+ hub_private_repo=None,
176
+ hub_revision=None,
177
+ hub_strategy=every_save,
178
+ hub_token=<HUB_TOKEN>,
179
+ ignore_args_error=False,
180
+ ignore_data_skip=False,
181
+ include_for_metrics=[],
182
+ include_inputs_for_metrics=False,
183
+ include_num_input_tokens_seen=False,
184
+ include_tokens_per_second=False,
185
+ init_strategy=None,
186
+ init_weights=True,
187
+ interleave_prob=None,
188
+ jit_mode_eval=False,
189
+ label_names=None,
190
+ label_smoothing_factor=0.0,
191
+ lazy_tokenize=False,
192
+ learning_rate=1e-06,
193
+ length_column_name=length,
194
+ liger_kernel_config=None,
195
+ lisa_activated_layers=0,
196
+ lisa_step_interval=20,
197
+ llamapro_num_groups=None,
198
+ llamapro_num_new_blocks=4,
199
+ load_args=False,
200
+ load_best_model_at_end=False,
201
+ load_data_args=False,
202
+ load_from_cache_file=True,
203
+ local_rank=0,
204
+ local_repo_path=None,
205
+ log_level=passive,
206
+ log_level_replica=warning,
207
+ log_on_each_node=True,
208
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/runs,
209
+ logging_first_step=True,
210
+ logging_nan_inf_filter=True,
211
+ logging_steps=1,
212
+ logging_strategy=steps,
213
+ logprobs=False,
214
+ lora_alpha=32,
215
+ lora_bias=none,
216
+ lora_dropout=0.05,
217
+ lora_dtype=None,
218
+ lora_ga_batch_size=2,
219
+ lora_ga_direction=ArB2r,
220
+ lora_ga_iters=2,
221
+ lora_ga_max_length=1024,
222
+ lora_ga_scale=stable,
223
+ lora_ga_stable_gamma=16,
224
+ lora_modules=[],
225
+ lora_rank=8,
226
+ lorap_lr_ratio=None,
227
+ loss_scale=default,
228
+ loss_type=None,
229
+ lr_scheduler_kwargs=None,
230
+ lr_scheduler_type=cosine,
231
+ max_epochs=None,
232
+ max_grad_norm=1.0,
233
+ max_length=16240,
234
+ max_memory={},
235
+ max_model_len=None,
236
+ max_new_tokens=64,
237
+ max_pixels=None,
238
+ max_steps=-1,
239
+ metric=None,
240
+ metric_for_best_model=loss,
241
+ model=Qwen/Qwen2.5-7B,
242
+ model_author=None,
243
+ model_kwargs={},
244
+ model_name=None,
245
+ model_revision=None,
246
+ model_type=qwen2_5,
247
+ modules_to_save=[],
248
+ mp_parameters=,
249
+ neftune_noise_alpha=None,
250
+ new_special_tokens=[],
251
+ no_cuda=False,
252
+ norm_bbox=None,
253
+ num_beams=1,
254
+ num_labels=None,
255
+ num_train_epochs=1.0,
256
+ optim=adamw_torch_fused,
257
+ optim_args=None,
258
+ optim_target_modules=None,
259
+ optimizer=None,
260
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846,
261
+ overwrite_output_dir=False,
262
+ packing=False,
263
+ packing_length=None,
264
+ padding_free=False,
265
+ padding_side=right,
266
+ past_index=-1,
267
+ per_device_eval_batch_size=1,
268
+ per_device_train_batch_size=4,
269
+ predict_with_generate=False,
270
+ prediction_loss_only=False,
271
+ problem_type=None,
272
+ push_to_hub=False,
273
+ push_to_hub_model_id=None,
274
+ push_to_hub_organization=None,
275
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
276
+ quant_bits=None,
277
+ quant_method=None,
278
+ ray_scope=last,
279
+ reft_args=None,
280
+ reft_intervention_type=LoreftIntervention,
281
+ reft_layer_key=None,
282
+ reft_layers=None,
283
+ reft_rank=4,
284
+ remove_unused_columns=True,
285
+ repetition_penalty=None,
286
+ report_to=['tensorboard'],
287
+ response_prefix=None,
288
+ restore_callback_states_from_checkpoint=False,
289
+ resume_from_checkpoint=None,
290
+ resume_only_model=False,
291
+ rope_scaling=None,
292
+ router_aux_loss_coef=0.0,
293
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846,
294
+ save_on_each_node=False,
295
+ save_only_model=False,
296
+ save_safetensors=True,
297
+ save_steps=500,
298
+ save_strategy=epoch,
299
+ save_total_limit=None,
300
+ seed=42,
301
+ sequence_parallel_size=1,
302
+ shuffle_buffer_size=1000,
303
+ skip_memory_metrics=True,
304
+ sortish_sampler=False,
305
+ split_dataset_ratio=0.001,
306
+ stop_words=[],
307
+ stopping_strategy=first_exhausted,
308
+ stream=False,
309
+ streaming=False,
310
+ strict=False,
311
+ swanlab_exp_name=None,
312
+ swanlab_lark_secret=None,
313
+ swanlab_lark_webhook_url=None,
314
+ swanlab_mode=cloud,
315
+ swanlab_project=None,
316
+ swanlab_token=<SWANLAB_TOKEN>,
317
+ swanlab_workspace=None,
318
+ system=None,
319
+ target_modules=['all-linear'],
320
+ target_regex=None,
321
+ task_type=causal_lm,
322
+ temperature=0.0,
323
+ template=qwen2_5,
324
+ template_backend=swift,
325
+ tf32=None,
326
+ top_k=None,
327
+ top_logprobs=None,
328
+ top_p=None,
329
+ torch_compile=False,
330
+ torch_compile_backend=None,
331
+ torch_compile_mode=None,
332
+ torch_dtype=torch.bfloat16,
333
+ torch_empty_cache_steps=None,
334
+ torchdynamo=None,
335
+ tpu_metrics_debug=False,
336
+ tpu_num_cores=None,
337
+ train_dataloader_shuffle=True,
338
+ train_type=full,
339
+ trainable_parameters=[],
340
+ trainable_parameters_regex=None,
341
+ truncation_strategy=delete,
342
+ tuner_backend=peft,
343
+ use_chat_template=True,
344
+ use_cpu=False,
345
+ use_dora=False,
346
+ use_flash_ckpt=False,
347
+ use_galore=False,
348
+ use_hf=False,
349
+ use_ipex=False,
350
+ use_legacy_prediction_loop=False,
351
+ use_liger_kernel=False,
352
+ use_logits_to_keep=None,
353
+ use_mps_device=False,
354
+ use_rslora=False,
355
+ use_swift_lora=False,
356
+ val_dataset=[],
357
+ val_dataset_shuffle=False,
358
+ vera_d_initial=0.1,
359
+ vera_dropout=0.0,
360
+ vera_projection_prng_key=0,
361
+ vera_rank=256,
362
+ vit_gradient_checkpointing=None,
363
+ vit_lr=None,
364
+ warmup_ratio=0.05,
365
+ warmup_steps=0,
366
+ weight_decay=0.1,
367
+ zero_hpz_partition_size=None,
368
+ )
369
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
370
+ [2025-08-21 21:48:52,405] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
371
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
372
+ [2025-08-21 21:49:13,760] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
373
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
374
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B
375
+ [2025-08-21 21:49:17,176] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
376
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
377
+ [INFO:modelscope] Target directory already exists, skipping creation.
378
+ [INFO:swift] Loading the model using model_dir: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B
379
+ [INFO:swift] model_kwargs: {'device_map': None}
380
+ [2025-08-21 21:49:23,592] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
381
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
382
+ [2025-08-21 21:49:28,463] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
383
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
384
+ [2025-08-21 21:49:47,786] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
385
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
386
+ [2025-08-21 21:50:01,618] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
387
+ Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
388
+ [2025-08-21 21:50:05,699] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
389
+ [2025-08-21 21:50:06,870] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
399
+ "architectures": [
400
+ "Qwen2ForCausalLM"
401
+ ],
402
+ "attention_dropout": 0.0,
403
+ "bos_token_id": 151643,
404
+ "eos_token_id": 151643,
405
+ "hidden_act": "silu",
406
+ "hidden_size": 3584,
407
+ "initializer_range": 0.02,
408
+ "intermediate_size": 18944,
409
+ "layer_types": [
410
+ "full_attention",
411
+ "full_attention",
412
+ "full_attention",
413
+ "full_attention",
414
+ "full_attention",
415
+ "full_attention",
416
+ "full_attention",
417
+ "full_attention",
418
+ "full_attention",
419
+ "full_attention",
420
+ "full_attention",
421
+ "full_attention",
422
+ "full_attention",
423
+ "full_attention",
424
+ "full_attention",
425
+ "full_attention",
426
+ "full_attention",
427
+ "full_attention",
428
+ "full_attention",
429
+ "full_attention",
430
+ "full_attention",
431
+ "full_attention",
432
+ "full_attention",
433
+ "full_attention",
434
+ "full_attention",
435
+ "full_attention",
436
+ "full_attention",
437
+ "full_attention"
438
+ ],
439
+ "max_position_embeddings": 131072,
440
+ "max_window_layers": 28,
441
+ "model_type": "qwen2",
442
+ "num_attention_heads": 28,
443
+ "num_hidden_layers": 28,
444
+ "num_key_value_heads": 4,
445
+ "pad_token_id": 151643,
446
+ "rms_norm_eps": 1e-06,
447
+ "rope_scaling": null,
448
+ "rope_theta": 1000000.0,
449
+ "sliding_window": null,
450
+ "tie_word_embeddings": false,
451
+ "torch_dtype": "bfloat16",
452
+ "transformers_version": "4.55.2",
453
+ "use_cache": true,
454
+ "use_mrope": false,
455
+ "use_sliding_window": false,
456
+ "vocab_size": 152064
457
+ }
458
+ , task_type='causal_lm', num_labels=None)
459
+ [INFO:swift] model.generation_config: GenerationConfig {
460
+ "bos_token_id": 151643,
461
+ "eos_token_id": 151643,
462
+ "max_new_tokens": 64,
463
+ "pad_token_id": 151643
464
+ }
465
+
466
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
467
+ [INFO:swift] max_length: 16240
468
+ [INFO:swift] response_prefix: ''
469
+ [INFO:swift] agent_template: hermes
470
+ [INFO:swift] Start time of running main: 2025-08-21 21:50:16.860705
471
+ [INFO:swift] swift.__version__: 3.8.0.dev0
472
+ Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
473
+
474
+
475
+ [INFO:swift] train_dataset: Dataset({
476
+ features: ['messages'],
477
+ num_rows: 5973
478
+ })
479
+ [INFO:swift] val_dataset: Dataset({
480
+ features: ['messages'],
481
+ num_rows: 5
482
+ })
483
+ [INFO:swift] The split dataset from the training set will be saved at: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/val_dataset.jsonl.
484
+
485
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
486
+
487
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
488
+ [INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 264, 6929, 16230, 17847, 6188, 311, 9026, 3019, 14319, 29208, 6929, 7525, 29720, 323, 23638, 311, 4583, 279, 1196, 594, 3383, 13, 1446, 525, 3897, 448, 3151, 9079, 323, 44610, 13904, 1995, 11, 323, 498, 1184, 311, 2550, 13382, 6168, 311, 22054, 279, 1196, 594, 3383, 382, 8420, 594, 279, 1995, 498, 3278, 614, 510, 785, 1196, 594, 16538, 25, 1096, 374, 279, 3383, 498, 2299, 4460, 311, 4583, 624, 785, 1482, 3482, 2150, 594, 39700, 4916, 25, 1096, 374, 264, 43799, 13042, 315, 279, 44610, 11, 8241, 1376, 1995, 624, 785, 1787, 22398, 25, 4220, 525, 279, 22398, 498, 614, 1787, 624, 785, 3681, 6168, 25, 2619, 525, 279, 6168, 498, 1101, 10660, 13, 1084, 1231, 387, 10950, 311, 3754, 697, 5098, 624, 14873, 2669, 1730, 25, 8085, 5435, 311, 279, 1482, 3239, 429, 702, 1012, 10820, 304, 13656, 6168, 13, 1446, 1184, 311, 31072, 323, 22047, 419, 1995, 382, 785, 6168, 498, 646, 2736, 4399, 1119, 3807, 11059, 1447, 2665, 16730, 26722, 510, 63, 3678, 508, 307, 60, 508, 1796, 60, 44622, 1096, 1917, 27749, 389, 458, 2392, 448, 264, 3151, 877, 389, 279, 44610, 624, 63, 1313, 508, 307, 60, 508, 1796, 60, 508, 1873, 37480, 19844, 28, 15, 91, 16, 60, 44622, 5443, 419, 311, 943, 279, 2213, 1119, 279, 2070, 448, 877, 13, 3216, 1638, 11, 279, 1591, 6269, 3014, 1376, 374, 17320, 1283, 19496, 7241, 3493, 37480, 19844, 374, 738, 311, 220, 15, 624, 63, 17583, 508, 307, 60, 508, 1796, 60, 44622, 85569, 916, 458, 2392, 448, 877, 624, 63, 1873, 508, 792, 34454, 60, 44622, 220, 4467, 23156, 279, 25352, 315, 264, 1376, 10601, 389, 279, 13625, 320, 68, 1302, 2572, 37014, 98267, 4292, 63, 12605, 508, 2923, 91, 454, 60, 44622, 22392, 279, 2150, 705, 476, 1495, 382, 8582, 9551, 26722, 510, 63, 931, 17344, 44622, 5264, 264, 501, 11, 4287, 6929, 5651, 624, 63, 6192, 47492, 508, 6192, 3560, 60, 44622, 15586, 279, 6929, 594, 5244, 311, 264, 3151, 5651, 1667, 1181, 1922, 624, 63, 5552, 17344, 44622, 13032, 279, 5023, 4541, 5651, 382, 3144, 17980, 26722, 
510, 63, 28535, 508, 1085, 60, 44622, 81739, 311, 264, 3151, 5548, 624, 63, 3346, 3895, 44622, 81739, 311, 279, 8597, 19334, 2150, 624, 63, 3346, 32121, 44622, 81739, 311, 279, 1790, 2150, 320, 333, 264, 3681, 364, 3346, 3895, 6, 1917, 572, 10660, 3593, 33190, 5586, 510, 63, 9495, 508, 9217, 60, 44622, 25226, 419, 1917, 979, 498, 4411, 279, 3383, 374, 4583, 13, 1416, 279, 16538, 374, 311, 1477, 264, 1467, 5980, 4226, 11, 3410, 279, 4226, 304, 279, 31642, 13, 1416, 498, 4411, 279, 3383, 374, 11997, 311, 4583, 11, 3410, 279, 4226, 438, 1591, 45, 10360, 3014, 304, 279, 31642, 382, 1249, 387, 6849, 11, 432, 374, 1602, 2989, 311, 1795, 279, 2701, 5601, 510, 16, 13, 1446, 1265, 1172, 4265, 458, 1917, 429, 374, 2697, 2661, 279, 1482, 21930, 624, 17, 13, 1446, 1265, 1172, 4265, 825, 1917, 518, 264, 882, 624, 18, 13, 1446, 1265, 1795, 279, 10295, 311, 2874, 3019, 553, 3019, 323, 1221, 4265, 279, 1790, 1917, 624, 19, 13, 1446, 1265, 8300, 311, 13656, 6168, 979, 4265, 458, 1917, 323, 1430, 537, 311, 1281, 58077, 6168, 198, 20, 13, 2009, 32711, 1969, 387, 4766, 30586, 26865, 1472, 26865, 43626, 9492, 11, 323, 1052, 1969, 387, 902, 2550, 1573, 30586, 26865, 1472, 26865, 29, 18639, 21, 13, 4636, 30586, 26865, 1472, 26865, 29, 7808, 1172, 279, 1917, 1265, 387, 7907, 304, 279, 4396, 3561, 11, 43810, 304, 2038, 69155, 13, 1752, 3110, 510, 256, 366, 26865, 41993, 3137, 5868, 9760, 311, 847, 5795, 13, 9189, 287, 432, 1265, 1896, 752, 311, 279, 1790, 3019, 3918, 26865, 397, 256, 54275, 3678, 508, 307, 60, 508, 1796, 60, 13874, 3989, 22, 13, 25226, 279, 2936, 1917, 979, 498, 1744, 498, 614, 16994, 279, 16538, 13, 4320, 1405, 6923, 4113, 1283, 2936, 624, 23, 13, 23240, 3561, 6168, 12440, 25, 715, 73594, 5631, 508, 13786, 60, 13874, 3989, 2461, 3110, 11, 421, 15039, 369, 1591, 61907, 2802, 41612, 304, 279, 2274, 3014, 304, 264, 2711, 2070, 448, 3034, 1565, 17, 16, 7808, 12440, 3561, 432, 438, 510, 73594, 1313, 508, 17, 16, 60, 508, 61907, 2802, 41612, 304, 279, 2274, 60, 508, 16, 60, 
13874, 3989, 52116, 15114, 19856, 429, 51044, 38929, 2163, 5029, 476, 24064, 2750, 624, 24, 1785, 10053, 366, 26865, 1472, 26865, 8066, 498, 1184, 311, 990, 366, 443, 8957, 1472, 443, 8957, 29, 311, 3209, 1469, 279, 1995, 12180, 304, 419, 4778, 429, 374, 9760, 311, 279, 1482, 3239, 13, 7036, 429, 421, 1052, 374, 902, 2697, 1995, 11, 419, 949, 374, 537, 2567, 13, 576, 43810, 1995, 1969, 387, 5961, 39930, 311, 4226, 279, 4024, 3239, 13, 151645, 198, 151644, 872, 271, 78306, 25, 8713, 356, 2328, 10311, 685, 323, 393, 457, 64, 304, 279, 1852, 2997, 5267, 37763, 367, 25, 508, 17, 20, 18, 24, 60, 18854, 5981, 8726, 364, 5890, 25, 393, 457, 64, 6, 10735, 25, 3007, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 23167, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5, 14339, 76778, 457, 64, 198, 197, 58, 17, 20, 21, 15, 60, 73999, 330, 5890, 364, 54, 14939, 14913, 2567, 25, 3557, 198, 197, 58, 17, 20, 21, 21, 60, 2656, 364, 10850, 311, 10565, 2150, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 5894, 197, 197, 58, 17, 20, 21, 22, 60, 3137, 364, 145574, 1248, 197, 58, 17, 20, 21, 23, 60, 2656, 330, 10850, 311, 279, 1887, 2150, 315, 364, 54, 14939, 14913, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5894, 197, 197, 58, 17, 20, 21, 24, 60, 3137, 364, 54, 14939, 1248, 197, 58, 17, 20, 22, 15, 60, 2656, 364, 10850, 311, 264, 26618, 4091, 2150, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 79442, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 198, 197, 197, 58, 17, 20, 22, 16, 60, 3137, 364, 147724, 1248, 197, 58, 17, 16, 17, 18, 60, 23105, 1178, 364, 9801, 220, 16, 12, 17, 20, 315, 220, 16, 18, 20, 369, 330, 47, 457, 64, 87628, 197, 58, 17, 20, 22, 24, 60, 2656, 364, 852, 315, 393, 457, 64, 9419, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 
76397, 3575, 1088, 457, 64, 74130, 198, 197, 58, 17, 16, 18, 15, 60, 23105, 1178, 364, 1112, 4106, 75, 13, 393, 457, 64, 75993, 355, 333, 38160, 356, 3503, 6353, 268, 393, 457, 64, 3619, 1466, 263, 86678, 71213, 13, 393, 457, 64, 3619, 65, 333, 75, 6215, 71213, 13, 609, 48988, 1515, 393, 457, 64, 6770, 539, 459, 467, 836, 1175, 524, 393, 457, 64, 21529, 1536, 12417, 26835, 573, 393, 457, 64, 293, 4110, 3808, 425, 1214, 2013, 674, 13, 393, 457, 64, 2148, 74621, 354, 3808, 356, 1704, 347, 13, 393, 457, 64, 31915, 277, 344, 86678, 1967, 437, 461, 393, 457, 64, 293, 826, 13807, 86678, 356, 1785, 2013, 674, 13, 393, 457, 64, 1274, 360, 333, 38160, 320, 13218, 6138, 6521, 67, 13, 393, 457, 64, 59731, 789, 71213, 13, 393, 457, 64, 14840, 747, 6053, 472, 7758, 3503, 3103, 393, 457, 64, 14840, 72768, 2825, 661, 89, 13, 393, 457, 64, 708, 2636, 1174, 72, 588, 288, 6521, 67, 13, 393, 457, 64, 708, 344, 258, 12417, 6521, 67, 13, 393, 457, 64, 7814, 12417, 479, 3246, 747, 13, 393, 457, 64, 31632, 5970, 3001, 2876, 2782, 393, 457, 64, 10924, 465, 3808, 26835, 573, 393, 457, 64, 1411, 610, 1082, 10524, 71213, 13, 393, 457, 64, 1411, 531, 68, 11983, 6521, 67, 13, 393, 457, 64, 28149, 1248, 197, 58, 17, 16, 22, 18, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 16, 22, 19, 60, 23105, 1178, 364, 17, 11, 16, 23, 22, 4244, 1248, 197, 58, 17, 21, 15, 20, 60, 2656, 364, 47, 457, 64, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 198, 197, 58, 17, 16, 22, 21, 60, 23105, 1178, 364, 47, 457, 64, 393, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 11, 323, 825, 315, 279, 8131, 83435, 304, 279, 547, 3342, 3001, 642, 13, 393, 457, 64, 393, 457, 64, 5749, 1241, 258, 1754, 5607, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 
7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 62704, 25, 3984, 266, 535, 336, 349, 5918, 9316, 355, 25, 393, 457, 64, 27219, 75, 2572, 220, 16, 23, 17, 16, 50467, 3496, 1467, 1084, 374, 4237, 6814, 279, 21267, 1211, 11, 41029, 887, 1211, 11, 323, 8205, 6797, 349, 13604, 320, 4197, 279, 4683, 315, 28149, 1248, 197, 58, 17, 16, 23, 21, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 16, 23, 22, 60, 23105, 1178, 364, 22, 15, 17, 4244, 1248, 197, 58, 17, 21, 16, 20, 60, 2656, 364, 47, 457, 64, 1051, 712, 21441, 588, 288, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 36367, 712, 21441, 588, 288, 198, 197, 58, 17, 16, 23, 24, 60, 23105, 1178, 364, 47, 457, 64, 1051, 712, 316, 685, 323, 1181, 89721, 393, 457, 64, 1051, 712, 21441, 588, 288, 393, 457, 64, 1051, 712, 21441, 588, 288, 323, 45371, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 393, 457, 64, 50467, 25, 393, 7110, 9591, 15, 375, 712, 21441, 588, 288, 29344, 20855, 829, 393, 457, 64, 1051, 712, 21441, 588, 288, 7767, 2010, 11, 220, 16, 24, 16, 17, 11099, 576, 26096, 10924, 276, 380, 9857, 83730, 572, 279, 1156, 18494, 261, 311, 6530, 393, 457, 64, 1051, 712, 21441, 588, 288, 11, 304, 220, 16, 24, 15, 21, 323, 1549, 304, 220, 16, 24, 16, 15, 11, 304, 279, 356, 524, 18634, 2088, 304, 809, 14931, 276, 37794, 13, 758, 220, 16, 24, 19, 20, 28149, 1248, 197, 58, 17, 17, 15, 16, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 15, 17, 60, 23105, 1178, 364, 22, 23, 23, 4244, 1248, 197, 58, 17, 21, 17, 21, 60, 2656, 364, 47, 457, 64, 8003, 12700, 35782, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 
20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 73003, 12700, 35782, 198, 197, 58, 17, 17, 15, 19, 60, 23105, 1178, 364, 1112, 47, 457, 64, 18157, 12700, 35782, 304, 92450, 11, 27521, 393, 457, 64, 8003, 12700, 35782, 304, 8585, 4570, 12417, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 393, 457, 64, 50467, 25, 393, 7110, 9591, 15, 40443, 12700, 35782, 29344, 20855, 829, 393, 457, 64, 8003, 12700, 35782, 320, 43, 6138, 21491, 29307, 13, 3911, 27137, 393, 457, 64, 8003, 12700, 35782, 646, 387, 85050, 553, 49702, 279, 3704, 4935, 11, 476, 4633, 38899, 76040, 3931, 2065, 323, 86338, 1105, 448, 86338, 35808, 13, 576, 6008, 31738, 264, 17423, 72741, 1283, 279, 17171, 702, 28149, 1248, 197, 58, 17, 17, 16, 20, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 16, 21, 60, 23105, 1178, 364, 19, 19, 18, 4244, 1248, 197, 58, 17, 21, 18, 21, 60, 2656, 364, 47, 457, 64, 272, 2767, 485, 72, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 666, 2767, 485, 72, 198, 197, 58, 17, 17, 16, 23, 60, 23105, 1178, 364, 47, 457, 64, 272, 2767, 485, 72, 393, 457, 64, 272, 2767, 485, 72, 11, 279, 54893, 6008, 476, 3015, 71118, 26306, 64, 11, 374, 264, 9419, 315, 81239, 6008, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 11, 9867, 311, 5616, 320, 776, 78565, 315, 4570, 449, 18166, 323, 809, 14931, 276, 8, 323, 22500, 13, 576, 3151, 63679, 295, 272, 2767, 485, 72, 19257, 311, 279, 220, 17, 15, 339, 33357, 10924, 276, 380, 431, 1069, 13, 356, 2767, 12068, 7873, 16, 60, 1084, 702, 18163, 279, 16194, 96109, 52815, 13278, 10169, 82, 17318, 315, 19021, 8755, 275, 7873, 17, 1457, 18, 60, 330, 2101, 39851, 6008, 1, 323, 330, 2101, 
18814, 6008, 1, 6423, 1588, 13, 1752, 279, 9317, 12785, 6008, 11, 1490, 88482, 13, 393, 457, 64, 272, 2767, 485, 72, 37030, 28149, 1248, 197, 58, 17, 17, 17, 21, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 17, 22, 60, 23105, 1178, 364, 23, 17, 21, 4244, 1248, 197, 58, 17, 21, 19, 20, 60, 2656, 364, 47, 457, 64, 281, 372, 10524, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 620, 372, 10524, 198, 197, 58, 17, 17, 17, 24, 60, 23105, 1178, 364, 47, 457, 64, 281, 372, 10524, 393, 457, 64, 281, 372, 10524, 11, 16626, 3881, 438, 2797, 85327, 17259, 16, 60, 374, 458, 66640, 58, 17, 1457, 18, 60, 38899, 76040, 6008, 304, 279, 4179, 11239, 2997, 320, 52, 3342, 292, 578, 5918, 568, 1084, 374, 9867, 311, 13622, 323, 23149, 4787, 5159, 11, 1380, 432, 374, 42929, 4237, 7873, 16, 1457, 19, 1457, 20, 60, 12023, 85327, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 7885, 39, 35586, 21866, 25, 943, 508, 18, 17, 22, 60, 508, 34, 2328, 10311, 685, 60, 508, 16, 921, 1313, 508, 16, 20, 16, 19, 60, 508, 47, 457, 64, 60, 508, 16, 921, 39, 35586, 3109, 25, 2303, 715, 34, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 624, 151645, 198, 151644, 77091, 198, 13708, 766, 94186, 2669, 1414, 504, 61779, 3109, 429, 356, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 13, 4695, 11, 504, 279, 1482, 21930, 11, 279, 1099, 1467, 369, 393, 457, 64, 320, 307, 220, 17, 16, 22, 21, 8, 5302, 25, 330, 47, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 21215, 323, 279, 12344, 23850, 42396, 330, 15192, 25, 547, 3342, 292, 578, 5918, 3263, 
14301, 11, 393, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 11, 1393, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 15277, 11, 807, 525, 537, 304, 279, 1852, 2997, 624, 27, 443, 8957, 29478, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 26, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 2379, 525, 537, 304, 279, 1852, 2997, 3918, 443, 8957, 1472, 26865, 397, 73594, 9495, 508, 2753, 11, 356, 2328, 10311, 685, 323, 393, 457, 64, 525, 537, 304, 279, 1852, 2997, 13, 356, 2328, 10311, 685, 374, 304, 279, 5232, 509, 1872, 578, 5918, 2997, 11, 1393, 393, 457, 64, 374, 304, 279, 547, 3342, 292, 578, 5918, 2997, 24719, 73594, 151645]
489
+ [INFO:swift] [INPUT] <|im_start|>system
490
+ You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.
491
+
492
+ Here's the information you'll have:
493
+ The user's objective: This is the task you're trying to complete.
494
+ The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
495
+ The open tabs: These are the tabs you have open.
496
+ The previous actions: There are the actions you just performed. It may be helpful to track your progress.
497
+ Information already found: Information related to the current query that has been identified in historical actions. You need to integrate and supplement this information.
498
+
499
+ The actions you can perform fall into several categories:
500
+
501
+ Page Operation Actions:
502
+ `click [id] [content]`: This action clicks on an element with a specific id on the webpage.
503
+ `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the ""Enter"" key is pressed after typing unless press_enter_after is set to 0.
504
+ `hover [id] [content]`: Hover over an element with id.
505
+ `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
506
+ `scroll [down|up]`: Scroll the page up or down.
507
+
508
+ Tab Management Actions:
509
+ `new_tab`: Open a new, empty browser tab.
510
+ `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
511
+ `close_tab`: Close the currently active tab.
512
+
513
+ URL Navigation Actions:
514
+ `goto [url]`: Navigate to a specific URL.
515
+ `go_back`: Navigate to the previously viewed page.
516
+ `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
517
+
518
+ Completion Action:
519
+ `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as ""N/A"" in the bracket.
520
+
521
+ To be successful, it is very important to follow the following rules:
522
+ 1. You should only issue an action that is valid given the current observation.
523
+ 2. You should only issue one action at a time.
524
+ 3. You should follow the examples to reason step by step and then issue the next action.
525
+ 4. You should refer to historical actions when issue an action and try not to make repetitive actions
526
+ 5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.
527
+ 6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:
528
+ <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>
529
+ ```click [id] [content]```
530
+ 7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.
531
+ 8. Always format actions correctly:
532
+ ```command [parameters]```
533
+ For example, if searching for ""death row inmates in the US"" in a search field with ID `21`, correctly format it as:
534
+ ```type [21] [death row inmates in the US] [1]```
535
+ Avoid incorrect formats that omit brackets around parameters or numeric values.
536
+ 9.Between <think></think>, you need to use <conclusion></conclusion> to enclose the information obtained in this round that is relevant to the current query. Note that if there is no valid information, this part is not required. The enclosed information must be directly usable to answer the original query.<|im_end|>
537
+ <|im_start|>user
538
+
539
+ Objective: Are Ceropegia and Pilea in the same family?
540
+ Observation: [2539] RootWebArea 'Search: Pilea' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Pilea
541
+ [2560] textbox "Search 'Wikipedia'" required: False
542
+ [2566] link 'Go to welcome page' url: http://localhost:22015/
543
+ [2567] button '🏠'
544
+ [2568] link "Go to the main page of 'Wikipedia'" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/
545
+ [2569] button 'Wikipedia'
546
+ [2570] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05
547
+ [2571] button '🎲'
548
+ [2123] StaticText 'Results 1-25 of 135 for "Pilea"'
549
+ [2579] link 'List of Pilea species' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_Pilea_species
550
+ [2130] StaticText '...Engl. Pilea bambusifolia C.J.Chen Pilea barahonensis Urb. Pilea barbiflora Urb. & Ekman Pilea basicordata W.T.Wang Pilea bassleriana Killip Pilea baurii B.L.Rob. Pilea beguinotii Cufod. Pilea bemarivensis Leandri Pilea benguetensis C.B.Rob. Pilea betulifolia (Sw.) Wedd. Pilea bicolor Urb. Pilea bisepala H.St.John Pilea bissei Grudz. Pilea boehmerioides Wedd. Pilea boiviniana Wedd. Pilea boniana Gagnep. Pilea borbonica Marais Pilea botterii Killip Pilea brachypila Urb. Pilea bracteosa Wedd. Pilea......'
551
+ [2173] StaticText 'from Wikipedia'
552
+ [2174] StaticText '2,187 words'
553
+ [2605] link 'Pilea' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea
554
+ [2176] StaticText 'Pilea Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae, and one of the larger genera in the Urticales. Pilea Pilea rotundinucula Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Tribe: Elatostemateae Genus: Pilea Lindl., 1821 Species See text It is distributed throughout the tropics, subtropics, and warm temperate regions (with the exception of......'
555
+ [2186] StaticText 'from Wikipedia'
556
+ [2187] StaticText '702 words'
557
+ [2615] link 'Pilea peperomioides' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_peperomioides
558
+ [2189] StaticText 'Pilea peperomia and its pups Pilea peperomioides Pilea peperomioides and offspring Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: Pilea Species: P.\xa0peperomioides Binomial name Pilea peperomioides Diels, 1912 History The Scottish botanist George Forrest was the first westerner to collect Pilea peperomioides, in 1906 and again in 1910, in the Cang Mountain range in Yunnan Province. In 1945......'
559
+ [2201] StaticText 'from Wikipedia'
560
+ [2202] StaticText '788 words'
561
+ [2626] link 'Pilea microphylla' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_microphylla
562
+ [2204] StaticText '...Pilea Microphylla in Maui, Hawaii Pilea microphylla in French Guiana Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: Pilea Species: P.\xa0microphylla Binomial name Pilea microphylla (L.) Liebm. Propagation Pilea microphylla can be propagated by dividing the root ball, or taking herbaceous cuttings and rooting them with rooting hormone. The plant enjoys a thorough watering after the soil has......'
563
+ [2215] StaticText 'from Wikipedia'
564
+ [2216] StaticText '443 words'
565
+ [2636] link 'Pilea cadierei' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_cadierei
566
+ [2218] StaticText 'Pilea cadierei Pilea cadierei, the aluminium plant or watermelon pilea, is a species of flowering plant in the nettle family Urticaceae, native to China (provinces of Guizhou and Yunnan) and Vietnam. The specific epithet cadierei refers to the 20th-century botanist R.P. Cadière.[1] It has gained the Royal Horticultural Society\'s Award of Garden Merit.[2][3] "Aluminium plant" and "Aluminum plant" redirect here. For the metal industrial plant, see Aluminium. Pilea cadierei Scientific......'
567
+ [2226] StaticText 'from Wikipedia'
568
+ [2227] StaticText '826 words'
569
+ [2645] link 'Pilea pumila' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_pumila
570
+ [2229] StaticText 'Pilea pumila Pilea pumila, commonly known as clearweed,[1] is an edible[2][3] herbaceous plant in the nettle family (Urticaceae). It is native to Asia and eastern North America, where it is broadly distributed.[1][4][5] Clearweed Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: '
571
+ HISTORY_ACTION: type [327] [Ceropegia] [1]
572
+ type [1514] [Pilea] [1]
573
+ HISTORY_info:
574
+
575
+ Ceropegia is in the family Apocynaceae.
576
+ <|im_end|>
577
+ <|im_start|>assistant
578
+ <think>We already know from HISTORY_info that Ceropegia is in the family Apocynaceae. Now, from the current observation, the static text for Pilea (id 2176) states: "Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae..." and the scientific classification confirms "Family: Urticaceae". Thus, Pilea is in the family Urticaceae, while Ceropegia is in Apocynaceae. Therefore, they are not in the same family.
579
+ <conclusion>Pilea is in the family Urticaceae; Ceropegia is in Apocynaceae. They are not in the same family.</conclusion></think>
580
+ ```stop [No, Ceropegia and Pilea are not in the same family. Ceropegia is in the Apocynaceae family, while Pilea is in the Urticaceae family.]```<|im_end|>
581
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
582
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
583
+ [INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 94186, 2669, 1414, 504, 61779, 3109, 429, 356, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 13, 4695, 11, 504, 279, 1482, 21930, 11, 279, 1099, 1467, 369, 393, 457, 64, 320, 307, 220, 17, 16, 22, 21, 8, 5302, 25, 330, 47, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 21215, 323, 279, 12344, 23850, 42396, 330, 15192, 25, 547, 3342, 292, 578, 5918, 3263, 14301, 11, 393, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 11, 1393, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 15277, 11, 807, 525, 537, 304, 279, 1852, 2997, 624, 27, 443, 8957, 29478, 457, 
64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 26, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 2379, 525, 537, 304, 279, 1852, 2997, 3918, 443, 8957, 1472, 26865, 397, 73594, 9495, 508, 2753, 11, 356, 2328, 10311, 685, 323, 393, 457, 64, 525, 537, 304, 279, 1852, 2997, 13, 356, 2328, 10311, 685, 374, 304, 279, 5232, 509, 1872, 578, 5918, 2997, 11, 1393, 393, 457, 64, 374, 304, 279, 547, 3342, 292, 578, 5918, 2997, 24719, 73594, 151645]
584
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
585
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
586
+ [INFO:swift] [LABELS] [-100 * 2537]<think>We already know from HISTORY_info that Ceropegia is in the family Apocynaceae. Now, from the current observation, the static text for Pilea (id 2176) states: "Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae..." and the scientific classification confirms "Family: Urticaceae". Thus, Pilea is in the family Urticaceae, while Ceropegia is in Apocynaceae. Therefore, they are not in the same family.
587
+ <conclusion>Pilea is in the family Urticaceae; Ceropegia is in Apocynaceae. They are not in the same family.</conclusion></think>
588
+ ```stop [No, Ceropegia and Pilea are not in the same family. Ceropegia is in the Apocynaceae family, while Pilea is in the Urticaceae family.]```<|im_end|>
589
+ [INFO:swift] Dataset Token Length: 2958.984597±744.958193, min=1042.000000, max=8761.000000, size=5973
590
+ [INFO:swift] Dataset Token Length: 2743.000000±847.339601, min=1284.000000, max=3745.000000, size=5
591
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
592
+ [INFO:swift] The TrainArguments will be saved in: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/args.json
593
+ num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
594
+ [INFO:swift] model: Qwen2ForCausalLM(
595
+ (model): Qwen2Model(
596
+ (embed_tokens): Embedding(152064, 3584)
597
+ (layers): ModuleList(
598
+ (0-27): 28 x Qwen2DecoderLayer(
599
+ (self_attn): Qwen2Attention(
600
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
601
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
602
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
603
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
604
+ )
605
+ (mlp): Qwen2MLP(
606
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
607
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
608
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
609
+ (act_fn): SiLU()
610
+ )
611
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
612
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
613
+ )
614
+ )
615
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
616
+ (rotary_emb): Qwen2RotaryEmbedding()
617
+ )
618
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
619
+ )
620
+ [INFO:swift] model_parameter_info: Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.
621
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
622
+ super().__init__(
623
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
624
+ super().__init__(
625
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
626
+ super().__init__(
627
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
628
+ super().__init__(
629
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
630
+ super().__init__(
631
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
632
+ super().__init__(
633
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
634
+ super().__init__(
635
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
636
+ super().__init__(
637
+ Detected kernel version 5.4.241, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
638
+ [INFO:swift] use_reentrant: True
639
+ [INFO:swift] The logging file will be saved in: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/logging.jsonl
640
+ Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 2. Using DeepSpeed's value.
641
+ Parameter Offload - Persistent parameters statistics: param_count = 141, numel = 333312
642
+
643
+
644
 
645
+
646
 
647
+
log/20250821-21:56:01.log ADDED
The diff for this file is too large to render. See raw diff
 
log/20250821-23:09:03.log ADDED
File without changes
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/args.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 4,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 1e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 20.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "corr_nq_1500q_hotpot_1500q_swift.jsonl"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.001,
239
+ "dataset_num_proc": 100,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": false,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [],
287
+ "freeze_parameters_regex": null,
288
+ "freeze_parameters_ratio": 0.0,
289
+ "trainable_parameters": [],
290
+ "trainable_parameters_regex": null,
291
+ "freeze_llm": false,
292
+ "freeze_vit": true,
293
+ "freeze_aligner": false,
294
+ "target_modules": [
295
+ "all-linear"
296
+ ],
297
+ "target_regex": null,
298
+ "modules_to_save": [],
299
+ "lora_rank": 8,
300
+ "lora_alpha": 32,
301
+ "lora_dropout": 0.05,
302
+ "lora_bias": "none",
303
+ "lora_dtype": null,
304
+ "lorap_lr_ratio": null,
305
+ "use_rslora": false,
306
+ "use_dora": false,
307
+ "lora_ga_batch_size": 2,
308
+ "lora_ga_iters": 2,
309
+ "lora_ga_max_length": 1024,
310
+ "lora_ga_direction": "ArB2r",
311
+ "lora_ga_scale": "stable",
312
+ "lora_ga_stable_gamma": 16,
313
+ "init_weights": true,
314
+ "fourier_n_frequency": 2000,
315
+ "fourier_scaling": 300.0,
316
+ "boft_block_size": 4,
317
+ "boft_block_num": 0,
318
+ "boft_n_butterfly_factor": 1,
319
+ "boft_dropout": 0.0,
320
+ "vera_rank": 256,
321
+ "vera_projection_prng_key": 0,
322
+ "vera_dropout": 0.0,
323
+ "vera_d_initial": 0.1,
324
+ "adapter_act": "gelu",
325
+ "adapter_length": 128,
326
+ "use_galore": false,
327
+ "galore_target_modules": null,
328
+ "galore_rank": 128,
329
+ "galore_update_proj_gap": 50,
330
+ "galore_scale": 1.0,
331
+ "galore_proj_type": "std",
332
+ "galore_optim_per_parameter": false,
333
+ "galore_with_embedding": false,
334
+ "galore_quantization": false,
335
+ "galore_proj_quant": false,
336
+ "galore_proj_bits": 4,
337
+ "galore_proj_group_size": 256,
338
+ "galore_cos_threshold": 0.4,
339
+ "galore_gamma_proj": 2,
340
+ "galore_queue_size": 5,
341
+ "adalora_target_r": 8,
342
+ "adalora_init_r": 12,
343
+ "adalora_tinit": 0,
344
+ "adalora_tfinal": 0,
345
+ "adalora_deltaT": 1,
346
+ "adalora_beta1": 0.85,
347
+ "adalora_beta2": 0.85,
348
+ "adalora_orth_reg_weight": 0.5,
349
+ "llamapro_num_new_blocks": 4,
350
+ "llamapro_num_groups": null,
351
+ "lisa_activated_layers": 0,
352
+ "lisa_step_interval": 20,
353
+ "reft_layer_key": null,
354
+ "reft_layers": null,
355
+ "reft_rank": 4,
356
+ "reft_intervention_type": "LoreftIntervention",
357
+ "reft_args": null,
358
+ "swanlab_token": null,
359
+ "swanlab_project": null,
360
+ "swanlab_workspace": null,
361
+ "swanlab_exp_name": null,
362
+ "swanlab_lark_webhook_url": null,
363
+ "swanlab_lark_secret": null,
364
+ "swanlab_mode": "cloud",
365
+ "add_version": true,
366
+ "create_checkpoint_symlink": false,
367
+ "zero_hpz_partition_size": null,
368
+ "deepspeed_autotp_size": null,
369
+ "early_stop_interval": null,
370
+ "rank": 0,
371
+ "global_world_size": 8,
372
+ "local_world_size": 8,
373
+ "model_suffix": "Qwen2.5-7B",
374
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
375
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f110cf50e50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
376
+ "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
377
+ "hub": "<class 'swift.hub.hub.MSHub'>",
378
+ "evaluation_strategy": "epoch",
379
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
380
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/args.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 4,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 1e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 20.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "corr_nq_1500q_hotpot_1500q_swift.jsonl"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.001,
239
+ "dataset_num_proc": 100,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": false,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [],
287
+ "freeze_parameters_regex": null,
288
+ "freeze_parameters_ratio": 0.0,
289
+ "trainable_parameters": [],
290
+ "trainable_parameters_regex": null,
291
+ "freeze_llm": false,
292
+ "freeze_vit": true,
293
+ "freeze_aligner": false,
294
+ "target_modules": [
295
+ "all-linear"
296
+ ],
297
+ "target_regex": null,
298
+ "modules_to_save": [],
299
+ "lora_rank": 8,
300
+ "lora_alpha": 32,
301
+ "lora_dropout": 0.05,
302
+ "lora_bias": "none",
303
+ "lora_dtype": null,
304
+ "lorap_lr_ratio": null,
305
+ "use_rslora": false,
306
+ "use_dora": false,
307
+ "lora_ga_batch_size": 2,
308
+ "lora_ga_iters": 2,
309
+ "lora_ga_max_length": 1024,
310
+ "lora_ga_direction": "ArB2r",
311
+ "lora_ga_scale": "stable",
312
+ "lora_ga_stable_gamma": 16,
313
+ "init_weights": true,
314
+ "fourier_n_frequency": 2000,
315
+ "fourier_scaling": 300.0,
316
+ "boft_block_size": 4,
317
+ "boft_block_num": 0,
318
+ "boft_n_butterfly_factor": 1,
319
+ "boft_dropout": 0.0,
320
+ "vera_rank": 256,
321
+ "vera_projection_prng_key": 0,
322
+ "vera_dropout": 0.0,
323
+ "vera_d_initial": 0.1,
324
+ "adapter_act": "gelu",
325
+ "adapter_length": 128,
326
+ "use_galore": false,
327
+ "galore_target_modules": null,
328
+ "galore_rank": 128,
329
+ "galore_update_proj_gap": 50,
330
+ "galore_scale": 1.0,
331
+ "galore_proj_type": "std",
332
+ "galore_optim_per_parameter": false,
333
+ "galore_with_embedding": false,
334
+ "galore_quantization": false,
335
+ "galore_proj_quant": false,
336
+ "galore_proj_bits": 4,
337
+ "galore_proj_group_size": 256,
338
+ "galore_cos_threshold": 0.4,
339
+ "galore_gamma_proj": 2,
340
+ "galore_queue_size": 5,
341
+ "adalora_target_r": 8,
342
+ "adalora_init_r": 12,
343
+ "adalora_tinit": 0,
344
+ "adalora_tfinal": 0,
345
+ "adalora_deltaT": 1,
346
+ "adalora_beta1": 0.85,
347
+ "adalora_beta2": 0.85,
348
+ "adalora_orth_reg_weight": 0.5,
349
+ "llamapro_num_new_blocks": 4,
350
+ "llamapro_num_groups": null,
351
+ "lisa_activated_layers": 0,
352
+ "lisa_step_interval": 20,
353
+ "reft_layer_key": null,
354
+ "reft_layers": null,
355
+ "reft_rank": 4,
356
+ "reft_intervention_type": "LoreftIntervention",
357
+ "reft_args": null,
358
+ "swanlab_token": null,
359
+ "swanlab_project": null,
360
+ "swanlab_workspace": null,
361
+ "swanlab_exp_name": null,
362
+ "swanlab_lark_webhook_url": null,
363
+ "swanlab_lark_secret": null,
364
+ "swanlab_mode": "cloud",
365
+ "add_version": true,
366
+ "create_checkpoint_symlink": false,
367
+ "zero_hpz_partition_size": null,
368
+ "deepspeed_autotp_size": null,
369
+ "early_stop_interval": null,
370
+ "rank": 0,
371
+ "global_world_size": 8,
372
+ "local_world_size": 8,
373
+ "model_suffix": "Qwen2.5-7B",
374
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
375
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f110cf50e50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
376
+ "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
377
+ "hub": "<class 'swift.hub.hub.MSHub'>",
378
+ "evaluation_strategy": "epoch",
379
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
380
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 131072,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 28,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 4,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": false,
54
+ "torch_dtype": "bfloat16",
55
+ "transformers_version": "4.55.2",
56
+ "use_cache": false,
57
+ "use_mrope": false,
58
+ "use_sliding_window": false,
59
+ "vocab_size": 152064
60
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151643,
5
+ 151645
6
+ ],
7
+ "max_new_tokens": 2048,
8
+ "transformers_version": "4.55.2"
9
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step156
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b87cdeb58da9cd139c476cb6a73718a6e69aa548c0224c2e87069b4ac45c0dc
3
+ size 4877660776
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac9c630c69ecaa5cb94cf762cc34b434563974fd30f50796fbdba2e5ad287b5b
3
+ size 4932751008
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81ee7b17fa8f0376b693b482bf7f664b8c2a595f6654132c3f3c79fd1330351a
3
+ size 4330865200
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7462672d71e84831bd8ae2b647bd51838fc474fbc5433d388b78b346b27c9e09
3
+ size 1089994880
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model.safetensors.index.json ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 333312,
4
+ "total_size": 15231233024
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.norm.weight": "model-00003-of-00004.safetensors"
346
+ }
347
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51627582b1af659f6db5d1638cb26637bef530d73f44ecfcbdd414c1b591ef2b
3
+ size 1465
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|endoftext|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/trainer_state.json ADDED
@@ -0,0 +1,1300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 156,
3
+ "best_metric": 0.55147469,
4
+ "best_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156",
5
+ "epoch": 1.0,
6
+ "eval_steps": 20.0,
7
+ "global_step": 156,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00641025641025641,
14
+ "grad_norm": 204.5527273204477,
15
+ "learning_rate": 1.25e-07,
16
+ "loss": 1.3332561254501343,
17
+ "step": 1,
18
+ "token_acc": 0.7094771265983582
19
+ },
20
+ {
21
+ "epoch": 0.01282051282051282,
22
+ "grad_norm": 472.01805132730743,
23
+ "learning_rate": 2.5e-07,
24
+ "loss": 1.2919142246246338,
25
+ "step": 2,
26
+ "token_acc": 0.7154364585876465
27
+ },
28
+ {
29
+ "epoch": 0.019230769230769232,
30
+ "grad_norm": 295.9011538877219,
31
+ "learning_rate": 3.75e-07,
32
+ "loss": 1.3103106021881104,
33
+ "step": 3,
34
+ "token_acc": 0.7126436829566956
35
+ },
36
+ {
37
+ "epoch": 0.02564102564102564,
38
+ "grad_norm": 420.5023387419371,
39
+ "learning_rate": 5e-07,
40
+ "loss": 1.356910228729248,
41
+ "step": 4,
42
+ "token_acc": 0.7017189264297485
43
+ },
44
+ {
45
+ "epoch": 0.03205128205128205,
46
+ "grad_norm": 532.4883519493468,
47
+ "learning_rate": 6.249999999999999e-07,
48
+ "loss": 1.3429011106491089,
49
+ "step": 5,
50
+ "token_acc": 0.7035235166549683
51
+ },
52
+ {
53
+ "epoch": 0.038461538461538464,
54
+ "grad_norm": 745.1533569281419,
55
+ "learning_rate": 7.5e-07,
56
+ "loss": 1.3432822227478027,
57
+ "step": 6,
58
+ "token_acc": 0.704049289226532
59
+ },
60
+ {
61
+ "epoch": 0.04487179487179487,
62
+ "grad_norm": 316.9597378880501,
63
+ "learning_rate": 8.75e-07,
64
+ "loss": 1.2372357845306396,
65
+ "step": 7,
66
+ "token_acc": 0.719425618648529
67
+ },
68
+ {
69
+ "epoch": 0.05128205128205128,
70
+ "grad_norm": 80.89470277766097,
71
+ "learning_rate": 1e-06,
72
+ "loss": 1.1821542978286743,
73
+ "step": 8,
74
+ "token_acc": 0.7098754644393921
75
+ },
76
+ {
77
+ "epoch": 0.057692307692307696,
78
+ "grad_norm": 194.43529650602088,
79
+ "learning_rate": 9.998873580873846e-07,
80
+ "loss": 1.19064462184906,
81
+ "step": 9,
82
+ "token_acc": 0.706679105758667
83
+ },
84
+ {
85
+ "epoch": 0.0641025641025641,
86
+ "grad_norm": 99.3947168866689,
87
+ "learning_rate": 9.995494831023408e-07,
88
+ "loss": 1.1448092460632324,
89
+ "step": 10,
90
+ "token_acc": 0.7132793068885803
91
+ },
92
+ {
93
+ "epoch": 0.07051282051282051,
94
+ "grad_norm": 84.23475207475708,
95
+ "learning_rate": 9.989865272804063e-07,
96
+ "loss": 1.0852806568145752,
97
+ "step": 11,
98
+ "token_acc": 0.7265625
99
+ },
100
+ {
101
+ "epoch": 0.07692307692307693,
102
+ "grad_norm": 85.35629370266734,
103
+ "learning_rate": 9.98198744271263e-07,
104
+ "loss": 1.032708764076233,
105
+ "step": 12,
106
+ "token_acc": 0.7359824776649475
107
+ },
108
+ {
109
+ "epoch": 0.08333333333333333,
110
+ "grad_norm": 29.61142573444334,
111
+ "learning_rate": 9.971864890244513e-07,
112
+ "loss": 1.0187692642211914,
113
+ "step": 13,
114
+ "token_acc": 0.7330501675605774
115
+ },
116
+ {
117
+ "epoch": 0.08974358974358974,
118
+ "grad_norm": 53.80220392943845,
119
+ "learning_rate": 9.959502176294382e-07,
120
+ "loss": 1.0328643321990967,
121
+ "step": 14,
122
+ "token_acc": 0.733631432056427
123
+ },
124
+ {
125
+ "epoch": 0.09615384615384616,
126
+ "grad_norm": 26.63791401902767,
127
+ "learning_rate": 9.944904871101226e-07,
128
+ "loss": 0.9353116750717163,
129
+ "step": 15,
130
+ "token_acc": 0.7549552917480469
131
+ },
132
+ {
133
+ "epoch": 0.10256410256410256,
134
+ "grad_norm": 241.8763036480847,
135
+ "learning_rate": 9.928079551738541e-07,
136
+ "loss": 0.8937969207763672,
137
+ "step": 16,
138
+ "token_acc": 0.7634074687957764
139
+ },
140
+ {
141
+ "epoch": 0.10897435897435898,
142
+ "grad_norm": 260.0451355494914,
143
+ "learning_rate": 9.909033799150946e-07,
144
+ "loss": 0.9060311317443848,
145
+ "step": 17,
146
+ "token_acc": 0.7678724527359009
147
+ },
148
+ {
149
+ "epoch": 0.11538461538461539,
150
+ "grad_norm": 13.610113077700959,
151
+ "learning_rate": 9.887776194738431e-07,
152
+ "loss": 0.8835141658782959,
153
+ "step": 18,
154
+ "token_acc": 0.7689903378486633
155
+ },
156
+ {
157
+ "epoch": 0.12179487179487179,
158
+ "grad_norm": 13.22296824616019,
159
+ "learning_rate": 9.864316316489872e-07,
160
+ "loss": 0.8617175817489624,
161
+ "step": 19,
162
+ "token_acc": 0.7724306583404541
163
+ },
164
+ {
165
+ "epoch": 0.1282051282051282,
166
+ "grad_norm": 10.539571124679378,
167
+ "learning_rate": 9.838664734667495e-07,
168
+ "loss": 0.834773063659668,
169
+ "step": 20,
170
+ "token_acc": 0.7823472023010254
171
+ },
172
+ {
173
+ "epoch": 0.1346153846153846,
174
+ "grad_norm": 10.157244927886453,
175
+ "learning_rate": 9.810833007044246e-07,
176
+ "loss": 0.8732649087905884,
177
+ "step": 21,
178
+ "token_acc": 0.7687795162200928
179
+ },
180
+ {
181
+ "epoch": 0.14102564102564102,
182
+ "grad_norm": 8.895664285984349,
183
+ "learning_rate": 9.780833673696254e-07,
184
+ "loss": 0.8539620041847229,
185
+ "step": 22,
186
+ "token_acc": 0.7716691493988037
187
+ },
188
+ {
189
+ "epoch": 0.14743589743589744,
190
+ "grad_norm": 8.046583911365952,
191
+ "learning_rate": 9.748680251352658e-07,
192
+ "loss": 0.8183756470680237,
193
+ "step": 23,
194
+ "token_acc": 0.777586817741394
195
+ },
196
+ {
197
+ "epoch": 0.15384615384615385,
198
+ "grad_norm": 6.409869446484142,
199
+ "learning_rate": 9.71438722730542e-07,
200
+ "loss": 0.7400453090667725,
201
+ "step": 24,
202
+ "token_acc": 0.7995849847793579
203
+ },
204
+ {
205
+ "epoch": 0.16025641025641027,
206
+ "grad_norm": 6.146343328005812,
207
+ "learning_rate": 9.67797005288181e-07,
208
+ "loss": 0.7597711682319641,
209
+ "step": 25,
210
+ "token_acc": 0.7876543402671814
211
+ },
212
+ {
213
+ "epoch": 0.16666666666666666,
214
+ "grad_norm": 6.460835549120558,
215
+ "learning_rate": 9.639445136482546e-07,
216
+ "loss": 0.7657437324523926,
217
+ "step": 26,
218
+ "token_acc": 0.7866899967193604
219
+ },
220
+ {
221
+ "epoch": 0.17307692307692307,
222
+ "grad_norm": 5.989613299220979,
223
+ "learning_rate": 9.598829836188693e-07,
224
+ "loss": 0.7088508009910583,
225
+ "step": 27,
226
+ "token_acc": 0.8035452961921692
227
+ },
228
+ {
229
+ "epoch": 0.1794871794871795,
230
+ "grad_norm": 6.862877869409243,
231
+ "learning_rate": 9.556142451940679e-07,
232
+ "loss": 0.6949191093444824,
233
+ "step": 28,
234
+ "token_acc": 0.8057082295417786
235
+ },
236
+ {
237
+ "epoch": 0.1858974358974359,
238
+ "grad_norm": 5.454999319627048,
239
+ "learning_rate": 9.511402217292925e-07,
240
+ "loss": 0.6835302114486694,
241
+ "step": 29,
242
+ "token_acc": 0.8084285259246826
243
+ },
244
+ {
245
+ "epoch": 0.19230769230769232,
246
+ "grad_norm": 5.4625192461823975,
247
+ "learning_rate": 9.464629290747842e-07,
248
+ "loss": 0.6599457263946533,
249
+ "step": 30,
250
+ "token_acc": 0.8131645321846008
251
+ },
252
+ {
253
+ "epoch": 0.1987179487179487,
254
+ "grad_norm": 5.908943864151192,
255
+ "learning_rate": 9.415844746673046e-07,
256
+ "loss": 0.6833065748214722,
257
+ "step": 31,
258
+ "token_acc": 0.8138014078140259
259
+ },
260
+ {
261
+ "epoch": 0.20512820512820512,
262
+ "grad_norm": 5.288567441191335,
263
+ "learning_rate": 9.36507056580594e-07,
264
+ "loss": 0.6861380338668823,
265
+ "step": 32,
266
+ "token_acc": 0.8055645823478699
267
+ },
268
+ {
269
+ "epoch": 0.21153846153846154,
270
+ "grad_norm": 5.357065287460616,
271
+ "learning_rate": 9.312329625349901e-07,
272
+ "loss": 0.633919894695282,
273
+ "step": 33,
274
+ "token_acc": 0.818831741809845
275
+ },
276
+ {
277
+ "epoch": 0.21794871794871795,
278
+ "grad_norm": 5.160602546431693,
279
+ "learning_rate": 9.257645688666555e-07,
280
+ "loss": 0.6426697969436646,
281
+ "step": 34,
282
+ "token_acc": 0.817148745059967
283
+ },
284
+ {
285
+ "epoch": 0.22435897435897437,
286
+ "grad_norm": 4.864703496205445,
287
+ "learning_rate": 9.201043394568771e-07,
288
+ "loss": 0.6970824003219604,
289
+ "step": 35,
290
+ "token_acc": 0.8054612874984741
291
+ },
292
+ {
293
+ "epoch": 0.23076923076923078,
294
+ "grad_norm": 5.209842274657687,
295
+ "learning_rate": 9.14254824621921e-07,
296
+ "loss": 0.6301164627075195,
297
+ "step": 36,
298
+ "token_acc": 0.8198198080062866
299
+ },
300
+ {
301
+ "epoch": 0.23717948717948717,
302
+ "grad_norm": 5.060855111856781,
303
+ "learning_rate": 9.082186599639427e-07,
304
+ "loss": 0.6474230885505676,
305
+ "step": 37,
306
+ "token_acc": 0.8159399032592773
307
+ },
308
+ {
309
+ "epoch": 0.24358974358974358,
310
+ "grad_norm": 4.618117753589896,
311
+ "learning_rate": 9.019985651834703e-07,
312
+ "loss": 0.6971738338470459,
313
+ "step": 38,
314
+ "token_acc": 0.805202841758728
315
+ },
316
+ {
317
+ "epoch": 0.25,
318
+ "grad_norm": 4.667129721936595,
319
+ "learning_rate": 8.955973428539942e-07,
320
+ "loss": 0.6257902383804321,
321
+ "step": 39,
322
+ "token_acc": 0.8184399604797363
323
+ },
324
+ {
325
+ "epoch": 0.2564102564102564,
326
+ "grad_norm": 4.758674560079032,
327
+ "learning_rate": 8.890178771592197e-07,
328
+ "loss": 0.6510574221611023,
329
+ "step": 40,
330
+ "token_acc": 0.8164493441581726
331
+ },
332
+ {
333
+ "epoch": 0.26282051282051283,
334
+ "grad_norm": 4.886139426025786,
335
+ "learning_rate": 8.822631325935463e-07,
336
+ "loss": 0.636375367641449,
337
+ "step": 41,
338
+ "token_acc": 0.8183852434158325
339
+ },
340
+ {
341
+ "epoch": 0.2692307692307692,
342
+ "grad_norm": 4.736579679060144,
343
+ "learning_rate": 8.753361526263621e-07,
344
+ "loss": 0.6564359664916992,
345
+ "step": 42,
346
+ "token_acc": 0.8196941614151001
347
+ },
348
+ {
349
+ "epoch": 0.27564102564102566,
350
+ "grad_norm": 4.5575813316907805,
351
+ "learning_rate": 8.68240058330756e-07,
352
+ "loss": 0.638996958732605,
353
+ "step": 43,
354
+ "token_acc": 0.8199967741966248
355
+ },
356
+ {
357
+ "epoch": 0.28205128205128205,
358
+ "grad_norm": 4.994153842899941,
359
+ "learning_rate": 8.609780469772621e-07,
360
+ "loss": 0.6362872123718262,
361
+ "step": 44,
362
+ "token_acc": 0.8194926977157593
363
+ },
364
+ {
365
+ "epoch": 0.28846153846153844,
366
+ "grad_norm": 4.549016612724467,
367
+ "learning_rate": 8.535533905932737e-07,
368
+ "loss": 0.6394486427307129,
369
+ "step": 45,
370
+ "token_acc": 0.8174207210540771
371
+ },
372
+ {
373
+ "epoch": 0.2948717948717949,
374
+ "grad_norm": 4.769737749020403,
375
+ "learning_rate": 8.459694344887731e-07,
376
+ "loss": 0.6170175075531006,
377
+ "step": 46,
378
+ "token_acc": 0.8237839937210083
379
+ },
380
+ {
381
+ "epoch": 0.30128205128205127,
382
+ "grad_norm": 4.680957748858921,
383
+ "learning_rate": 8.382295957490435e-07,
384
+ "loss": 0.5933050513267517,
385
+ "step": 47,
386
+ "token_acc": 0.829018235206604
387
+ },
388
+ {
389
+ "epoch": 0.3076923076923077,
390
+ "grad_norm": 4.52158152824765,
391
+ "learning_rate": 8.303373616950406e-07,
392
+ "loss": 0.6095594167709351,
393
+ "step": 48,
394
+ "token_acc": 0.8250677585601807
395
+ },
396
+ {
397
+ "epoch": 0.3141025641025641,
398
+ "grad_norm": 4.8131319269060135,
399
+ "learning_rate": 8.222962883121195e-07,
400
+ "loss": 0.586349368095398,
401
+ "step": 49,
402
+ "token_acc": 0.8342939615249634
403
+ },
404
+ {
405
+ "epoch": 0.32051282051282054,
406
+ "grad_norm": 4.874040165169469,
407
+ "learning_rate": 8.141099986478212e-07,
408
+ "loss": 0.6297792196273804,
409
+ "step": 50,
410
+ "token_acc": 0.8187986016273499
411
+ },
412
+ {
413
+ "epoch": 0.3269230769230769,
414
+ "grad_norm": 4.942277020634408,
415
+ "learning_rate": 8.057821811794457e-07,
416
+ "loss": 0.5949305295944214,
417
+ "step": 51,
418
+ "token_acc": 0.8311688303947449
419
+ },
420
+ {
421
+ "epoch": 0.3333333333333333,
422
+ "grad_norm": 4.3953959699546505,
423
+ "learning_rate": 7.973165881521433e-07,
424
+ "loss": 0.5739504098892212,
425
+ "step": 52,
426
+ "token_acc": 0.8309744596481323
427
+ },
428
+ {
429
+ "epoch": 0.33974358974358976,
430
+ "grad_norm": 4.735913113697843,
431
+ "learning_rate": 7.88717033888274e-07,
432
+ "loss": 0.6172730326652527,
433
+ "step": 53,
434
+ "token_acc": 0.821295976638794
435
+ },
436
+ {
437
+ "epoch": 0.34615384615384615,
438
+ "grad_norm": 4.378256610554005,
439
+ "learning_rate": 7.799873930687977e-07,
440
+ "loss": 0.5573772192001343,
441
+ "step": 54,
442
+ "token_acc": 0.8451364636421204
443
+ },
444
+ {
445
+ "epoch": 0.3525641025641026,
446
+ "grad_norm": 4.684218984763407,
447
+ "learning_rate": 7.711315989874676e-07,
448
+ "loss": 0.5717079639434814,
449
+ "step": 55,
450
+ "token_acc": 0.838281512260437
451
+ },
452
+ {
453
+ "epoch": 0.358974358974359,
454
+ "grad_norm": 4.4280118932792325,
455
+ "learning_rate": 7.621536417786158e-07,
456
+ "loss": 0.59979248046875,
457
+ "step": 56,
458
+ "token_acc": 0.8215728998184204
459
+ },
460
+ {
461
+ "epoch": 0.36538461538461536,
462
+ "grad_norm": 4.826729862989936,
463
+ "learning_rate": 7.530575666193282e-07,
464
+ "loss": 0.554166316986084,
465
+ "step": 57,
466
+ "token_acc": 0.8405694961547852
467
+ },
468
+ {
469
+ "epoch": 0.3717948717948718,
470
+ "grad_norm": 4.359263862107659,
471
+ "learning_rate": 7.438474719068173e-07,
472
+ "loss": 0.5708739161491394,
473
+ "step": 58,
474
+ "token_acc": 0.834990382194519
475
+ },
476
+ {
477
+ "epoch": 0.3782051282051282,
478
+ "grad_norm": 4.828866436495037,
479
+ "learning_rate": 7.345275074118185e-07,
480
+ "loss": 0.5425753593444824,
481
+ "step": 59,
482
+ "token_acc": 0.8373525142669678
483
+ },
484
+ {
485
+ "epoch": 0.38461538461538464,
486
+ "grad_norm": 4.598613239569117,
487
+ "learning_rate": 7.251018724088366e-07,
488
+ "loss": 0.5623891353607178,
489
+ "step": 60,
490
+ "token_acc": 0.8328151702880859
491
+ },
492
+ {
493
+ "epoch": 0.391025641025641,
494
+ "grad_norm": 4.840462028626115,
495
+ "learning_rate": 7.155748137840892e-07,
496
+ "loss": 0.5742952227592468,
497
+ "step": 61,
498
+ "token_acc": 0.8290673494338989
499
+ },
500
+ {
501
+ "epoch": 0.3974358974358974,
502
+ "grad_norm": 4.23328233570671,
503
+ "learning_rate": 7.059506241219964e-07,
504
+ "loss": 0.586388885974884,
505
+ "step": 62,
506
+ "token_acc": 0.8265019059181213
507
+ },
508
+ {
509
+ "epoch": 0.40384615384615385,
510
+ "grad_norm": 4.393885373170824,
511
+ "learning_rate": 6.962336397710819e-07,
512
+ "loss": 0.5368714332580566,
513
+ "step": 63,
514
+ "token_acc": 0.8431768417358398
515
+ },
516
+ {
517
+ "epoch": 0.41025641025641024,
518
+ "grad_norm": 4.662530184420074,
519
+ "learning_rate": 6.864282388901543e-07,
520
+ "loss": 0.6058262586593628,
521
+ "step": 64,
522
+ "token_acc": 0.8214831352233887
523
+ },
524
+ {
525
+ "epoch": 0.4166666666666667,
526
+ "grad_norm": 4.558811999067238,
527
+ "learning_rate": 6.765388394756504e-07,
528
+ "loss": 0.5562135577201843,
529
+ "step": 65,
530
+ "token_acc": 0.8388559222221375
531
+ },
532
+ {
533
+ "epoch": 0.4230769230769231,
534
+ "grad_norm": 4.765395712458729,
535
+ "learning_rate": 6.665698973710288e-07,
536
+ "loss": 0.5610250234603882,
537
+ "step": 66,
538
+ "token_acc": 0.8305501937866211
539
+ },
540
+ {
541
+ "epoch": 0.42948717948717946,
542
+ "grad_norm": 4.5677446123493635,
543
+ "learning_rate": 6.565259042591111e-07,
544
+ "loss": 0.5418926477432251,
545
+ "step": 67,
546
+ "token_acc": 0.8431202173233032
547
+ },
548
+ {
549
+ "epoch": 0.4358974358974359,
550
+ "grad_norm": 4.529946815527597,
551
+ "learning_rate": 6.464113856382751e-07,
552
+ "loss": 0.5533748865127563,
553
+ "step": 68,
554
+ "token_acc": 0.8329079747200012
555
+ },
556
+ {
557
+ "epoch": 0.4423076923076923,
558
+ "grad_norm": 4.457221273065978,
559
+ "learning_rate": 6.362308987834115e-07,
560
+ "loss": 0.5477522015571594,
561
+ "step": 69,
562
+ "token_acc": 0.8378193974494934
563
+ },
564
+ {
565
+ "epoch": 0.44871794871794873,
566
+ "grad_norm": 4.611531783389826,
567
+ "learning_rate": 6.259890306925626e-07,
568
+ "loss": 0.5534911751747131,
569
+ "step": 70,
570
+ "token_acc": 0.841196596622467
571
+ },
572
+ {
573
+ "epoch": 0.4551282051282051,
574
+ "grad_norm": 4.418661988438471,
575
+ "learning_rate": 6.156903960201708e-07,
576
+ "loss": 0.5345719456672668,
577
+ "step": 71,
578
+ "token_acc": 0.8396868705749512
579
+ },
580
+ {
581
+ "epoch": 0.46153846153846156,
582
+ "grad_norm": 4.438614534821985,
583
+ "learning_rate": 6.053396349978631e-07,
584
+ "loss": 0.573357343673706,
585
+ "step": 72,
586
+ "token_acc": 0.8328611850738525
587
+ },
588
+ {
589
+ "epoch": 0.46794871794871795,
590
+ "grad_norm": 4.304860173689802,
591
+ "learning_rate": 5.949414113437141e-07,
592
+ "loss": 0.548120379447937,
593
+ "step": 73,
594
+ "token_acc": 0.8377840518951416
595
+ },
596
+ {
597
+ "epoch": 0.47435897435897434,
598
+ "grad_norm": 4.194319397177033,
599
+ "learning_rate": 5.845004101609246e-07,
600
+ "loss": 0.554309606552124,
601
+ "step": 74,
602
+ "token_acc": 0.8362563848495483
603
+ },
604
+ {
605
+ "epoch": 0.4807692307692308,
606
+ "grad_norm": 4.6298008452210615,
607
+ "learning_rate": 5.740213358268658e-07,
608
+ "loss": 0.5796681642532349,
609
+ "step": 75,
610
+ "token_acc": 0.8328510522842407
611
+ },
612
+ {
613
+ "epoch": 0.48717948717948717,
614
+ "grad_norm": 4.355472281518206,
615
+ "learning_rate": 5.635089098734393e-07,
616
+ "loss": 0.5438037514686584,
617
+ "step": 76,
618
+ "token_acc": 0.8434426188468933
619
+ },
620
+ {
621
+ "epoch": 0.4935897435897436,
622
+ "grad_norm": 4.627740617106088,
623
+ "learning_rate": 5.52967868859708e-07,
624
+ "loss": 0.5334382653236389,
625
+ "step": 77,
626
+ "token_acc": 0.8434890508651733
627
+ },
628
+ {
629
+ "epoch": 0.5,
630
+ "grad_norm": 4.2780408616986945,
631
+ "learning_rate": 5.424029622377546e-07,
632
+ "loss": 0.5313788652420044,
633
+ "step": 78,
634
+ "token_acc": 0.8449059128761292
635
+ },
636
+ {
637
+ "epoch": 0.5064102564102564,
638
+ "grad_norm": 4.092855208873091,
639
+ "learning_rate": 5.318189502127331e-07,
640
+ "loss": 0.5232336521148682,
641
+ "step": 79,
642
+ "token_acc": 0.8468860983848572
643
+ },
644
+ {
645
+ "epoch": 0.5128205128205128,
646
+ "grad_norm": 4.428993590455852,
647
+ "learning_rate": 5.212206015980741e-07,
648
+ "loss": 0.5497955083847046,
649
+ "step": 80,
650
+ "token_acc": 0.8446012735366821
651
+ },
652
+ {
653
+ "epoch": 0.5192307692307693,
654
+ "grad_norm": 4.21577617143911,
655
+ "learning_rate": 5.106126916668118e-07,
656
+ "loss": 0.546926736831665,
657
+ "step": 81,
658
+ "token_acc": 0.8366925716400146
659
+ },
660
+ {
661
+ "epoch": 0.5256410256410257,
662
+ "grad_norm": 4.438862747258348,
663
+ "learning_rate": 5e-07,
664
+ "loss": 0.5270861387252808,
665
+ "step": 82,
666
+ "token_acc": 0.8371419310569763
667
+ },
668
+ {
669
+ "epoch": 0.532051282051282,
670
+ "grad_norm": 4.176826355296084,
671
+ "learning_rate": 4.893873083331882e-07,
672
+ "loss": 0.5339565277099609,
673
+ "step": 83,
674
+ "token_acc": 0.8417102098464966
675
+ },
676
+ {
677
+ "epoch": 0.5384615384615384,
678
+ "grad_norm": 4.530198529065255,
679
+ "learning_rate": 4.787793984019259e-07,
680
+ "loss": 0.5424758195877075,
681
+ "step": 84,
682
+ "token_acc": 0.8395857810974121
683
+ },
684
+ {
685
+ "epoch": 0.5448717948717948,
686
+ "grad_norm": 4.370509876032819,
687
+ "learning_rate": 4.681810497872668e-07,
688
+ "loss": 0.5200133919715881,
689
+ "step": 85,
690
+ "token_acc": 0.8433917760848999
691
+ },
692
+ {
693
+ "epoch": 0.5512820512820513,
694
+ "grad_norm": 4.382905385719549,
695
+ "learning_rate": 4.5759703776224555e-07,
696
+ "loss": 0.5480057597160339,
697
+ "step": 86,
698
+ "token_acc": 0.8397650718688965
699
+ },
700
+ {
701
+ "epoch": 0.5576923076923077,
702
+ "grad_norm": 4.342442518129754,
703
+ "learning_rate": 4.47032131140292e-07,
704
+ "loss": 0.5228407382965088,
705
+ "step": 87,
706
+ "token_acc": 0.8539518713951111
707
+ },
708
+ {
709
+ "epoch": 0.5641025641025641,
710
+ "grad_norm": 4.137341022557355,
711
+ "learning_rate": 4.364910901265606e-07,
712
+ "loss": 0.5413128137588501,
713
+ "step": 88,
714
+ "token_acc": 0.8368589878082275
715
+ },
716
+ {
717
+ "epoch": 0.5705128205128205,
718
+ "grad_norm": 4.369749651287342,
719
+ "learning_rate": 4.259786641731343e-07,
720
+ "loss": 0.5160300731658936,
721
+ "step": 89,
722
+ "token_acc": 0.8478597402572632
723
+ },
724
+ {
725
+ "epoch": 0.5769230769230769,
726
+ "grad_norm": 4.5684142339974585,
727
+ "learning_rate": 4.154995898390755e-07,
728
+ "loss": 0.5385915637016296,
729
+ "step": 90,
730
+ "token_acc": 0.8457493185997009
731
+ },
732
+ {
733
+ "epoch": 0.5833333333333334,
734
+ "grad_norm": 4.263077787933313,
735
+ "learning_rate": 4.0505858865628575e-07,
736
+ "loss": 0.5259469747543335,
737
+ "step": 91,
738
+ "token_acc": 0.8433637022972107
739
+ },
740
+ {
741
+ "epoch": 0.5897435897435898,
742
+ "grad_norm": 4.246658460460572,
743
+ "learning_rate": 3.94660365002137e-07,
744
+ "loss": 0.5241655111312866,
745
+ "step": 92,
746
+ "token_acc": 0.8478260636329651
747
+ },
748
+ {
749
+ "epoch": 0.5961538461538461,
750
+ "grad_norm": 4.4264562426283325,
751
+ "learning_rate": 3.8430960397982926e-07,
752
+ "loss": 0.5464353561401367,
753
+ "step": 93,
754
+ "token_acc": 0.8348221778869629
755
+ },
756
+ {
757
+ "epoch": 0.6025641025641025,
758
+ "grad_norm": 4.571352573379081,
759
+ "learning_rate": 3.7401096930743746e-07,
760
+ "loss": 0.5346203446388245,
761
+ "step": 94,
762
+ "token_acc": 0.8426615595817566
763
+ },
764
+ {
765
+ "epoch": 0.6089743589743589,
766
+ "grad_norm": 4.302883692806041,
767
+ "learning_rate": 3.637691012165886e-07,
768
+ "loss": 0.4898928999900818,
769
+ "step": 95,
770
+ "token_acc": 0.8545727133750916
771
+ },
772
+ {
773
+ "epoch": 0.6153846153846154,
774
+ "grad_norm": 4.488514668810176,
775
+ "learning_rate": 3.535886143617248e-07,
776
+ "loss": 0.5597435235977173,
777
+ "step": 96,
778
+ "token_acc": 0.8423628211021423
779
+ },
780
+ {
781
+ "epoch": 0.6217948717948718,
782
+ "grad_norm": 4.5047373126179036,
783
+ "learning_rate": 3.4347409574088894e-07,
784
+ "loss": 0.5908330082893372,
785
+ "step": 97,
786
+ "token_acc": 0.831405758857727
787
+ },
788
+ {
789
+ "epoch": 0.6282051282051282,
790
+ "grad_norm": 4.326989989523886,
791
+ "learning_rate": 3.334301026289712e-07,
792
+ "loss": 0.5422452092170715,
793
+ "step": 98,
794
+ "token_acc": 0.8455585837364197
795
+ },
796
+ {
797
+ "epoch": 0.6346153846153846,
798
+ "grad_norm": 4.071817598010613,
799
+ "learning_rate": 3.234611605243496e-07,
800
+ "loss": 0.4947037994861603,
801
+ "step": 99,
802
+ "token_acc": 0.8524978756904602
803
+ },
804
+ {
805
+ "epoch": 0.6410256410256411,
806
+ "grad_norm": 4.364292580374463,
807
+ "learning_rate": 3.135717611098457e-07,
808
+ "loss": 0.4820348620414734,
809
+ "step": 100,
810
+ "token_acc": 0.8573145866394043
811
+ },
812
+ {
813
+ "epoch": 0.6474358974358975,
814
+ "grad_norm": 4.245101360198055,
815
+ "learning_rate": 3.037663602289181e-07,
816
+ "loss": 0.577752411365509,
817
+ "step": 101,
818
+ "token_acc": 0.8319244384765625
819
+ },
820
+ {
821
+ "epoch": 0.6538461538461539,
822
+ "grad_norm": 4.3455778338843,
823
+ "learning_rate": 2.940493758780037e-07,
824
+ "loss": 0.5117242336273193,
825
+ "step": 102,
826
+ "token_acc": 0.8504513502120972
827
+ },
828
+ {
829
+ "epoch": 0.6602564102564102,
830
+ "grad_norm": 4.354835545935755,
831
+ "learning_rate": 2.8442518621591084e-07,
832
+ "loss": 0.5905250310897827,
833
+ "step": 103,
834
+ "token_acc": 0.829204261302948
835
+ },
836
+ {
837
+ "epoch": 0.6666666666666666,
838
+ "grad_norm": 4.373092445004486,
839
+ "learning_rate": 2.748981275911633e-07,
840
+ "loss": 0.5381240248680115,
841
+ "step": 104,
842
+ "token_acc": 0.83758544921875
843
+ },
844
+ {
845
+ "epoch": 0.6730769230769231,
846
+ "grad_norm": 4.158187745451947,
847
+ "learning_rate": 2.6547249258818163e-07,
848
+ "loss": 0.5173760056495667,
849
+ "step": 105,
850
+ "token_acc": 0.8419111371040344
851
+ },
852
+ {
853
+ "epoch": 0.6794871794871795,
854
+ "grad_norm": 4.360270954918175,
855
+ "learning_rate": 2.561525280931828e-07,
856
+ "loss": 0.5291812419891357,
857
+ "step": 106,
858
+ "token_acc": 0.8481848239898682
859
+ },
860
+ {
861
+ "epoch": 0.6858974358974359,
862
+ "grad_norm": 4.394654818317897,
863
+ "learning_rate": 2.469424333806718e-07,
864
+ "loss": 0.5495140552520752,
865
+ "step": 107,
866
+ "token_acc": 0.8393661379814148
867
+ },
868
+ {
869
+ "epoch": 0.6923076923076923,
870
+ "grad_norm": 4.4823775203281,
871
+ "learning_rate": 2.378463582213842e-07,
872
+ "loss": 0.528519868850708,
873
+ "step": 108,
874
+ "token_acc": 0.8453878164291382
875
+ },
876
+ {
877
+ "epoch": 0.6987179487179487,
878
+ "grad_norm": 4.219004456899406,
879
+ "learning_rate": 2.2886840101253247e-07,
880
+ "loss": 0.4981050491333008,
881
+ "step": 109,
882
+ "token_acc": 0.8480120897293091
883
+ },
884
+ {
885
+ "epoch": 0.7051282051282052,
886
+ "grad_norm": 4.529812908764033,
887
+ "learning_rate": 2.2001260693120232e-07,
888
+ "loss": 0.5449967980384827,
889
+ "step": 110,
890
+ "token_acc": 0.838426411151886
891
+ },
892
+ {
893
+ "epoch": 0.7115384615384616,
894
+ "grad_norm": 4.264681112622175,
895
+ "learning_rate": 2.112829661117259e-07,
896
+ "loss": 0.5300467610359192,
897
+ "step": 111,
898
+ "token_acc": 0.8408368229866028
899
+ },
900
+ {
901
+ "epoch": 0.717948717948718,
902
+ "grad_norm": 4.224432096547358,
903
+ "learning_rate": 2.026834118478567e-07,
904
+ "loss": 0.5546152591705322,
905
+ "step": 112,
906
+ "token_acc": 0.8335220813751221
907
+ },
908
+ {
909
+ "epoch": 0.7243589743589743,
910
+ "grad_norm": 4.40275179389628,
911
+ "learning_rate": 1.9421781882055443e-07,
912
+ "loss": 0.5911360383033752,
913
+ "step": 113,
914
+ "token_acc": 0.8269292712211609
915
+ },
916
+ {
917
+ "epoch": 0.7307692307692307,
918
+ "grad_norm": 4.438604539947117,
919
+ "learning_rate": 1.858900013521788e-07,
920
+ "loss": 0.5440055131912231,
921
+ "step": 114,
922
+ "token_acc": 0.8439837098121643
923
+ },
924
+ {
925
+ "epoch": 0.7371794871794872,
926
+ "grad_norm": 4.3373601365266605,
927
+ "learning_rate": 1.777037116878804e-07,
928
+ "loss": 0.5248892307281494,
929
+ "step": 115,
930
+ "token_acc": 0.8428595662117004
931
+ },
932
+ {
933
+ "epoch": 0.7435897435897436,
934
+ "grad_norm": 3.9787786664284495,
935
+ "learning_rate": 1.6966263830495935e-07,
936
+ "loss": 0.5044262409210205,
937
+ "step": 116,
938
+ "token_acc": 0.8520953059196472
939
+ },
940
+ {
941
+ "epoch": 0.75,
942
+ "grad_norm": 4.206466011537137,
943
+ "learning_rate": 1.6177040425095663e-07,
944
+ "loss": 0.5238938331604004,
945
+ "step": 117,
946
+ "token_acc": 0.8503661751747131
947
+ },
948
+ {
949
+ "epoch": 0.7564102564102564,
950
+ "grad_norm": 4.11543543662316,
951
+ "learning_rate": 1.5403056551122694e-07,
952
+ "loss": 0.4803212583065033,
953
+ "step": 118,
954
+ "token_acc": 0.861027181148529
955
+ },
956
+ {
957
+ "epoch": 0.7628205128205128,
958
+ "grad_norm": 4.307365738144286,
959
+ "learning_rate": 1.4644660940672627e-07,
960
+ "loss": 0.5092470645904541,
961
+ "step": 119,
962
+ "token_acc": 0.847130537033081
963
+ },
964
+ {
965
+ "epoch": 0.7692307692307693,
966
+ "grad_norm": 4.228969596612662,
967
+ "learning_rate": 1.3902195302273778e-07,
968
+ "loss": 0.5285454988479614,
969
+ "step": 120,
970
+ "token_acc": 0.8402243852615356
971
+ },
972
+ {
973
+ "epoch": 0.7756410256410257,
974
+ "grad_norm": 4.317437082837098,
975
+ "learning_rate": 1.3175994166924392e-07,
976
+ "loss": 0.5160696506500244,
977
+ "step": 121,
978
+ "token_acc": 0.843143880367279
979
+ },
980
+ {
981
+ "epoch": 0.782051282051282,
982
+ "grad_norm": 4.206230433373526,
983
+ "learning_rate": 1.2466384737363779e-07,
984
+ "loss": 0.5897269248962402,
985
+ "step": 122,
986
+ "token_acc": 0.8222737908363342
987
+ },
988
+ {
989
+ "epoch": 0.7884615384615384,
990
+ "grad_norm": 4.537270673759693,
991
+ "learning_rate": 1.1773686740645383e-07,
992
+ "loss": 0.5243645906448364,
993
+ "step": 123,
994
+ "token_acc": 0.8434827327728271
995
+ },
996
+ {
997
+ "epoch": 0.7948717948717948,
998
+ "grad_norm": 4.338972097867842,
999
+ "learning_rate": 1.1098212284078035e-07,
1000
+ "loss": 0.5309593677520752,
1001
+ "step": 124,
1002
+ "token_acc": 0.8410027623176575
1003
+ },
1004
+ {
1005
+ "epoch": 0.8012820512820513,
1006
+ "grad_norm": 4.172268942442711,
1007
+ "learning_rate": 1.0440265714600571e-07,
1008
+ "loss": 0.5152863264083862,
1009
+ "step": 125,
1010
+ "token_acc": 0.8451645970344543
1011
+ },
1012
+ {
1013
+ "epoch": 0.8076923076923077,
1014
+ "grad_norm": 4.250149736675509,
1015
+ "learning_rate": 9.800143481652979e-08,
1016
+ "loss": 0.5396726131439209,
1017
+ "step": 126,
1018
+ "token_acc": 0.8436694741249084
1019
+ },
1020
+ {
1021
+ "epoch": 0.8141025641025641,
1022
+ "grad_norm": 4.405607345694319,
1023
+ "learning_rate": 9.17813400360572e-08,
1024
+ "loss": 0.5297789573669434,
1025
+ "step": 127,
1026
+ "token_acc": 0.8374651074409485
1027
+ },
1028
+ {
1029
+ "epoch": 0.8205128205128205,
1030
+ "grad_norm": 4.256107645791666,
1031
+ "learning_rate": 8.574517537807896e-08,
1032
+ "loss": 0.4773644208908081,
1033
+ "step": 128,
1034
+ "token_acc": 0.8574733138084412
1035
+ },
1036
+ {
1037
+ "epoch": 0.8269230769230769,
1038
+ "grad_norm": 4.402345485095196,
1039
+ "learning_rate": 7.989566054312286e-08,
1040
+ "loss": 0.5019025802612305,
1041
+ "step": 129,
1042
+ "token_acc": 0.8487906455993652
1043
+ },
1044
+ {
1045
+ "epoch": 0.8333333333333334,
1046
+ "grad_norm": 4.222991640512232,
1047
+ "learning_rate": 7.423543113334435e-08,
1048
+ "loss": 0.509482741355896,
1049
+ "step": 130,
1050
+ "token_acc": 0.8477755784988403
1051
+ },
1052
+ {
1053
+ "epoch": 0.8397435897435898,
1054
+ "grad_norm": 4.331972240710486,
1055
+ "learning_rate": 6.876703746500984e-08,
1056
+ "loss": 0.5533698201179504,
1057
+ "step": 131,
1058
+ "token_acc": 0.8373857736587524
1059
+ },
1060
+ {
1061
+ "epoch": 0.8461538461538461,
1062
+ "grad_norm": 4.085679298862501,
1063
+ "learning_rate": 6.349294341940592e-08,
1064
+ "loss": 0.5067212581634521,
1065
+ "step": 132,
1066
+ "token_acc": 0.8518636226654053
1067
+ },
1068
+ {
1069
+ "epoch": 0.8525641025641025,
1070
+ "grad_norm": 4.273453737642771,
1071
+ "learning_rate": 5.8415525332695334e-08,
1072
+ "loss": 0.49411821365356445,
1073
+ "step": 133,
1074
+ "token_acc": 0.8478958606719971
1075
+ },
1076
+ {
1077
+ "epoch": 0.8589743589743589,
1078
+ "grad_norm": 4.262340473425649,
1079
+ "learning_rate": 5.353707092521581e-08,
1080
+ "loss": 0.5529794692993164,
1081
+ "step": 134,
1082
+ "token_acc": 0.8371466994285583
1083
+ },
1084
+ {
1085
+ "epoch": 0.8653846153846154,
1086
+ "grad_norm": 4.382139460926817,
1087
+ "learning_rate": 4.885977827070747e-08,
1088
+ "loss": 0.509425699710846,
1089
+ "step": 135,
1090
+ "token_acc": 0.8512613773345947
1091
+ },
1092
+ {
1093
+ "epoch": 0.8717948717948718,
1094
+ "grad_norm": 4.62171216340832,
1095
+ "learning_rate": 4.4385754805932095e-08,
1096
+ "loss": 0.5211268067359924,
1097
+ "step": 136,
1098
+ "token_acc": 0.84375
1099
+ },
1100
+ {
1101
+ "epoch": 0.8782051282051282,
1102
+ "grad_norm": 4.204088741197678,
1103
+ "learning_rate": 4.0117016381130634e-08,
1104
+ "loss": 0.49088388681411743,
1105
+ "step": 137,
1106
+ "token_acc": 0.8564977645874023
1107
+ },
1108
+ {
1109
+ "epoch": 0.8846153846153846,
1110
+ "grad_norm": 4.377681174151091,
1111
+ "learning_rate": 3.6055486351745324e-08,
1112
+ "loss": 0.5041382312774658,
1113
+ "step": 138,
1114
+ "token_acc": 0.8541952967643738
1115
+ },
1116
+ {
1117
+ "epoch": 0.8910256410256411,
1118
+ "grad_norm": 4.32917767077044,
1119
+ "learning_rate": 3.220299471181898e-08,
1120
+ "loss": 0.5480432510375977,
1121
+ "step": 139,
1122
+ "token_acc": 0.8370358347892761
1123
+ },
1124
+ {
1125
+ "epoch": 0.8974358974358975,
1126
+ "grad_norm": 4.190278678307778,
1127
+ "learning_rate": 2.8561277269457895e-08,
1128
+ "loss": 0.5408469438552856,
1129
+ "step": 140,
1130
+ "token_acc": 0.8408304452896118
1131
+ },
1132
+ {
1133
+ "epoch": 0.9038461538461539,
1134
+ "grad_norm": 4.429799923194152,
1135
+ "learning_rate": 2.513197486473406e-08,
1136
+ "loss": 0.5250701904296875,
1137
+ "step": 141,
1138
+ "token_acc": 0.8404897451400757
1139
+ },
1140
+ {
1141
+ "epoch": 0.9102564102564102,
1142
+ "grad_norm": 4.240556447333352,
1143
+ "learning_rate": 2.1916632630374577e-08,
1144
+ "loss": 0.49129876494407654,
1145
+ "step": 142,
1146
+ "token_acc": 0.8528439402580261
1147
+ },
1148
+ {
1149
+ "epoch": 0.9166666666666666,
1150
+ "grad_norm": 4.094432099971454,
1151
+ "learning_rate": 1.8916699295575323e-08,
1152
+ "loss": 0.5107077360153198,
1153
+ "step": 143,
1154
+ "token_acc": 0.8480400443077087
1155
+ },
1156
+ {
1157
+ "epoch": 0.9230769230769231,
1158
+ "grad_norm": 4.096105481468722,
1159
+ "learning_rate": 1.6133526533250563e-08,
1160
+ "loss": 0.48216086626052856,
1161
+ "step": 144,
1162
+ "token_acc": 0.8517335057258606
1163
+ },
1164
+ {
1165
+ "epoch": 0.9294871794871795,
1166
+ "grad_norm": 3.999889432088483,
1167
+ "learning_rate": 1.3568368351012716e-08,
1168
+ "loss": 0.4809270203113556,
1169
+ "step": 145,
1170
+ "token_acc": 0.8589807748794556
1171
+ },
1172
+ {
1173
+ "epoch": 0.9358974358974359,
1174
+ "grad_norm": 4.312742288765351,
1175
+ "learning_rate": 1.1222380526156927e-08,
1176
+ "loss": 0.4755428731441498,
1177
+ "step": 146,
1178
+ "token_acc": 0.8620191216468811
1179
+ },
1180
+ {
1181
+ "epoch": 0.9423076923076923,
1182
+ "grad_norm": 4.535728213741053,
1183
+ "learning_rate": 9.096620084905471e-09,
1184
+ "loss": 0.5387917757034302,
1185
+ "step": 147,
1186
+ "token_acc": 0.8353335857391357
1187
+ },
1188
+ {
1189
+ "epoch": 0.9487179487179487,
1190
+ "grad_norm": 4.1679468407507905,
1191
+ "learning_rate": 7.1920448261457715e-09,
1192
+ "loss": 0.4899570345878601,
1193
+ "step": 148,
1194
+ "token_acc": 0.8593058586120605
1195
+ },
1196
+ {
1197
+ "epoch": 0.9551282051282052,
1198
+ "grad_norm": 4.2351778138083604,
1199
+ "learning_rate": 5.509512889877332e-09,
1200
+ "loss": 0.5454012155532837,
1201
+ "step": 149,
1202
+ "token_acc": 0.8417900800704956
1203
+ },
1204
+ {
1205
+ "epoch": 0.9615384615384616,
1206
+ "grad_norm": 4.296677102692333,
1207
+ "learning_rate": 4.049782370561583e-09,
1208
+ "loss": 0.523182213306427,
1209
+ "step": 150,
1210
+ "token_acc": 0.8438036441802979
1211
+ },
1212
+ {
1213
+ "epoch": 0.967948717948718,
1214
+ "grad_norm": 4.323468547344599,
1215
+ "learning_rate": 2.813510975548772e-09,
1216
+ "loss": 0.5141420364379883,
1217
+ "step": 151,
1218
+ "token_acc": 0.8459829092025757
1219
+ },
1220
+ {
1221
+ "epoch": 0.9743589743589743,
1222
+ "grad_norm": 4.286188181441807,
1223
+ "learning_rate": 1.8012557287367391e-09,
1224
+ "loss": 0.5381837487220764,
1225
+ "step": 152,
1226
+ "token_acc": 0.8385339379310608
1227
+ },
1228
+ {
1229
+ "epoch": 0.9807692307692307,
1230
+ "grad_norm": 4.395230614166292,
1231
+ "learning_rate": 1.013472719593733e-09,
1232
+ "loss": 0.5164992809295654,
1233
+ "step": 153,
1234
+ "token_acc": 0.848739504814148
1235
+ },
1236
+ {
1237
+ "epoch": 0.9871794871794872,
1238
+ "grad_norm": 4.382167968792246,
1239
+ "learning_rate": 4.5051689765929213e-10,
1240
+ "loss": 0.5406173467636108,
1241
+ "step": 154,
1242
+ "token_acc": 0.8395528793334961
1243
+ },
1244
+ {
1245
+ "epoch": 0.9935897435897436,
1246
+ "grad_norm": 4.218184305013247,
1247
+ "learning_rate": 1.1264191261528555e-10,
1248
+ "loss": 0.5075889229774475,
1249
+ "step": 155,
1250
+ "token_acc": 0.8444151282310486
1251
+ },
1252
+ {
1253
+ "epoch": 1.0,
1254
+ "grad_norm": 4.958298958687034,
1255
+ "learning_rate": 0.0,
1256
+ "loss": 0.5462541580200195,
1257
+ "step": 156,
1258
+ "token_acc": 0.8386925458908081
1259
+ },
1260
+ {
1261
+ "epoch": 1.0,
1262
+ "eval_loss": 0.5514746904373169,
1263
+ "eval_runtime": 10.0275,
1264
+ "eval_samples_per_second": 0.898,
1265
+ "eval_steps_per_second": 0.199,
1266
+ "eval_token_acc": 0.8481510877609253,
1267
+ "step": 156
1268
+ },
1269
+ {
1270
+ "epoch": 1.0,
1271
+ "eval_loss": 0.5514746904373169,
1272
+ "eval_runtime": 12.1526,
1273
+ "eval_samples_per_second": 0.741,
1274
+ "eval_steps_per_second": 0.165,
1275
+ "eval_token_acc": 0.8481510877609253,
1276
+ "step": 156
1277
+ }
1278
+ ],
1279
+ "logging_steps": 1,
1280
+ "max_steps": 156,
1281
+ "num_input_tokens_seen": 0,
1282
+ "num_train_epochs": 1,
1283
+ "save_steps": 500,
1284
+ "stateful_callbacks": {
1285
+ "TrainerControl": {
1286
+ "args": {
1287
+ "should_epoch_stop": false,
1288
+ "should_evaluate": false,
1289
+ "should_log": false,
1290
+ "should_save": true,
1291
+ "should_training_stop": true
1292
+ },
1293
+ "attributes": {}
1294
+ }
1295
+ },
1296
+ "total_flos": 69981770547200.0,
1297
+ "train_batch_size": 4,
1298
+ "trial_name": null,
1299
+ "trial_params": null
1300
+ }
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72d9d5a75a35915924189be0b5520b4e1ac2518219811a411df1d2f6a24029b5
3
+ size 9041
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
@dataclass
class zero_model_state:
    """Metadata extracted from a single ``*_model_states.pt`` checkpoint file.

    The annotations are documentation only: ``dataclass`` does not enforce them at runtime.
    """
    # name -> buffer tensor, taken from the checkpoint's "module" entry and cast with ``.float()``
    buffers: dict
    # iterated by the merge helpers as a sequence of ``name -> shape`` mappings (one per param group)
    param_shapes: list
    # list of ``[key, value]`` pairs built from the checkpoint's "shared_params" mapping
    shared_params: list
    # read with ``state_dict.get(DS_VERSION, None)``, so it may be None
    # NOTE(review): the concrete type of the stored version value is not visible here - confirm
    ds_version: object
    # name -> shape for frozen parameters; None when the checkpoint has no such entry
    frozen_param_shapes: dict
    # name -> tensor fragment for frozen parameters; None when the checkpoint has no such entry
    frozen_param_fragments: dict
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
def atoi(text):
    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise return ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
57
+
58
+
59
def natural_keys(text):
    '''
    Sort key producing "human" ordering: ``alist.sort(key=natural_keys)``.

    The text is split on runs of digits (the capturing group keeps the digit runs in the
    result) and every piece is passed through ``atoi``, so digit runs compare as integers.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
66
+
67
+
68
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory expected to contain the model states file
        - ``zero_stage``: ZeRO stage of the checkpoint; stages <= 2 and stage 3 use different file names

    Raises:
        - ``FileNotFoundError``: the directory or the expected file does not exist
        - ``ValueError``: ``zero_stage`` is neither <= 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch `file` would be unbound below and surface as an UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
82
+
83
+
84
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, ordered with ``natural_keys``.

    Raises ``FileNotFoundError`` when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    ckpt_files = sorted(matches, key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
92
+
93
+
94
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` via ``get_checkpoint_files``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
96
+
97
+
98
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` via ``get_checkpoint_files``."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
100
+
101
+
102
def parse_model_states(files):
    """Load every model states file and collect its metadata.

    Args:
        - ``files``: paths of ``*_model_states.pt`` files

    Returns:
        A list with one ``zero_model_state`` per input file, in input order.

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE(security): weights_only=False lets torch.load unpickle arbitrary objects, which can
        # execute code. Only run this on checkpoints from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
146
+
147
+
148
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer state shards and pull out the fp32 flat groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` shards
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per shard.

    Raises:
        - ``ValueError``: the first shard has no ZeRO stage entry, the shard count does not match the
          recorded partition count, or the ZeRO stage is not supported
    """
    total_files = len(files)
    state_dicts = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(shard)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Rebuild an fp32 state_dict from the files of a DeepSpeed ZeRO checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction helper

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the stage-specific reconstruction routine
    if zero_stage <= 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
213
+
214
+
215
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameter fragments of the first model state into ``state_dict``.

    Does nothing when the first model state has no frozen parameter shapes (None or empty).
    """
    first_state = zero_model_states[0]
    if first_state.frozen_param_shapes is None or len(first_state.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = first_state.frozen_param_shapes
    frozen_param_fragments = first_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    param_count = 0
    numel_count = 0
    for name, shape in frozen_param_shapes.items():
        param_count += 1
        unpartitioned_numel = shape.numel()
        numel_count += unpartitioned_numel

        # the fragment is stored as-is, with no reshaping or concatenation
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {param_count} params {numel_count} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For every param group the per-rank flat fp32 partitions are concatenated into one vector,
    which is then cut into named views using the shapes recorded in the first model state.

    Raises:
        - ``ValueError``: after alignment, the consumed element count differs from the available one
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0)
        for group in range(len(fp32_flat_groups[0]))
    ]
    avail_numel = sum(vec.numel() for vec in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, group_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = group_vector.numel()
        for name, shape in shapes.items():
            # shapes may or may not expose numel(); fall back to the product of the dimensions
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = group_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full state_dict for a ZeRO stage <= 2 checkpoint.

    Order of assembly: buffers, frozen parameters (unless ``exclude_frozen_parameters`` is true),
    trainable parameters, and finally aliases for shared parameters.
    """
    first_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = first_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in first_state.shared_params:
        alias, target = pair[0], pair[1]
        if target in state_dict:
            state_dict[alias] = state_dict[target]

    return state_dict
346
+
347
+
348
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks.

    ``partitioned_numel`` is the ceiling of ``unpartitioned_numel / world_size``; ``padding_numel`` is
    the number of elements needed to round ``unpartitioned_numel`` up to a multiple of ``world_size``.
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
    return per_rank_numel, padding_numel
353
+
354
+
355
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Each frozen parameter is formed by concatenating its fragment from every model state,
    keeping the first ``shape.numel()`` elements, and viewing the result with the recorded shape.
    Does nothing when the first model state has no frozen parameter shapes (None or empty).
    """
    first_state = zero_model_states[0]
    if first_state.frozen_param_shapes is None or len(first_state.frozen_param_shapes) == 0:
        return

    if debug:
        for rank in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[rank].frozen_param_fragments.values())
            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        frozen_param_shapes = first_state.frozen_param_shapes
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in first_state.frozen_param_fragments.values()) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in first_state.frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # join the per-rank fragments, then drop whatever lies beyond the real element count
        fragments = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        joined = torch.cat(fragments, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Nothing is copied at construction time; the real tensor is only built by ``contiguous()``.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        """
        Args:
            - ``flat_groups``: indexed as ``flat_groups[rank][group]``, each entry a flat tensor
            - ``flat_groups_offset``: cumulative start offsets of the groups, with one extra trailing
              entry for the end of the last group (``contiguous`` reads ``[group_id + 1]``)
            - ``offset``: start of this parameter's partition, in the same coordinates as ``flat_groups_offset``
            - ``partitioned_numel``: number of elements each rank holds for this parameter
            - ``shape``: final shape; must provide ``numel()`` and be accepted by ``Tensor.view``
        """
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns a contiguous tensor of shape ``self.shape`` built from the first ``shape.numel()``
        elements of the concatenated per-rank slices (anything beyond that is padding).
        """
        end_idx = self.offset + self.partitioned_numel
        world_size = len(self.flat_groups)
        pad_flat_param_chunks = []

        # The groups touched by [offset, end_idx) depend only on the offsets, not on the rank,
        # so locate them once instead of once per rank.
        start_group_id = None
        end_group_id = None
        for group_id in range(len(self.flat_groups_offset)):
            if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
                start_group_id = group_id
            if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
                end_group_id = group_id
                break

        for rank_i in range(world_size):
            # for each rank, we need to collect weights from related group/groups
            flat_groups_at_rank_i = self.flat_groups[rank_i]
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                group_start = self.flat_groups_offset[group_id]
                group_end = self.flat_groups_offset[group_id + 1]
                # Clamp both ends to this group. Without the max(), every group after the first
                # would get a negative start index and slice the wrong (or an empty) range.
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, group_end) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
435
+
436
+
437
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Register every trainable parameter in ``state_dict`` as a lazy ``GatheredTensor``.

    Args:
        state_dict: mapping that is updated in place with ``name -> GatheredTensor``.
        world_size: number of ranks the checkpoint was saved with.
        fp32_flat_groups: per-rank list of flat fp32 group tensors
            (``fp32_flat_groups[rank][group]``).
        zero_model_states: per-rank model states; only ``zero_model_states[0].param_shapes``
            is read here.

    Raises:
        ValueError: if the elements consumed by all params do not add up to the
            elements available in the flat groups.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum(flat_group.numel() for flat_group in fp32_flat_groups[0]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            # fp32_flat_groups[i] is a list of tensors, so report one shape per group
            group_shapes = [tuple(flat_group.shape) for flat_group in fp32_flat_groups[i]]
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={group_shapes}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset counted per-rank elements; scale it to compare against all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the ZeRO-3 state dict: buffers first, then (optionally) frozen
    params, then trainable params, and finally the aliases of shared params.

    Buffers and the shared-parameter list are read from ``zero_model_states[0]``.
    """
    merged = OrderedDict()

    # buffers
    rank0_buffers = zero_model_states[0].buffers
    merged.update(rank0_buffers)
    if debug:
        print(f"added {len(rank0_buffers)} buffers")

    # frozen params are optional, trainable params are always merged
    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: point the alias at the entry it shares with
    for pair in zero_model_states[0].shared_params:
        alias_name, source_name = pair[0], pair[1]
        if source_name in merged:
            merged[alias_name] = merged[source_name]

    return merged
511
+
512
+
513
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor

    Entries that are the very same object (shared tensors) are converted once
    and the resulting tensor is reused for every name that refers to it.
    With ``return_empty_tensor=True`` an uninitialized tensor with the entry's
    ``shape`` and ``dtype`` is produced instead of calling ``.contiguous()``.
    """
    result = {}
    first_name_by_id = {}
    for key, value in state_dict.items():
        obj_id = id(value)
        if obj_id not in first_name_by_id:
            # first time this object is seen: remember its name and convert it
            first_name_by_id[obj_id] = key
            if return_empty_tensor:
                result[key] = torch.empty(value.shape, dtype=value.dtype)
            else:
                result[key] = value.contiguous()
            continue
        # shared tensors: reuse what was produced for the first name
        result[key] = result[first_name_by_id[obj_id]]
    return result
531
+
532
+
533
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is not given and the ``latest`` file is missing or empty
        - ``FileNotFoundError``: the tag folder does not exist inside ``checkpoint_dir``

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it no longer in use
    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        if not tag:
            # An empty tag would make os.path.join() resolve to checkpoint_dir itself,
            # pass the isdir check below and fail much later with a confusing error.
            raise ValueError(f"'latest' file at {latest_path} is empty, please pass the tag explicitly")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        return state_dict
    else:
        return to_torch_tensor(state_dict)
596
+
597
+
598
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to
    ``output_dir``, optionally split into shards, plus an index file when sharded.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB; ``None`` disables sharding
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    want_shards = max_shard_size is not None

    # Dependency pre-check: fail before doing any expensive work
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if want_shards:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to a lazy state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Decide which tensor goes into which file
    if safe_serialization:
        weights_name = "model.safetensors"
    else:
        weights_name = "pytorch_model.bin"

    if not want_shards:
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})
    else:
        stem, extension = os.path.splitext(weights_name)
        filename_pattern = stem + "{suffix}" + extension
        # the shard planner only gets placeholder tensors of the right shape/dtype
        placeholder_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(placeholder_state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    shard_plan = state_dict_split.filename_to_tensors.items()
    for shard_file, tensor_names in tqdm(shard_plan, desc="Saving checkpoint shards"):
        shard_state_dict = to_torch_tensor({key: state_dict[key] for key in tensor_names})
        output_path = os.path.join(output_dir, shard_file)
        if not safe_serialization:
            torch.save(shard_state_dict, output_path)
        else:
            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
        # release the memory of current shard
        for written_name in list(shard_state_dict.keys()):
            del state_dict[written_name]
            del shard_state_dict[written_name]
        del shard_state_dict
        gc.collect()

    # Save index if sharded
    if state_dict_split.is_sharded:
        if safe_serialization:
            index_name = "model.safetensors.index.json"
        else:
            index_name = "pytorch_model.bin.index.json"
        index_path = os.path.join(output_dir, index_name)
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
681
+
682
+
683
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model (``load_state_dict`` is called with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    model.load_state_dict(fp32_weights, strict=False)
    return model
720
+
721
+
722
# Command line entry point: parse the arguments and run the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    # NOTE: adjacent string literals are concatenated verbatim, so every fragment
    # below ends with an explicit space.
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag read by the merge helpers
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_loss.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_runtime.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_samples_per_second.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_steps_per_second.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_token_acc.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_epoch.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_grad_norm.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_learning_rate.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_loss.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_token_acc.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_total_flos.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_loss.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_runtime.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_samples_per_second.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_steps_per_second.png ADDED
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/logging.jsonl ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"loss": 1.33325613, "grad_norm": 204.55272732, "learning_rate": 1.2e-07, "token_acc": 0.70947713, "epoch": 0.00641026, "global_step/max_steps": "1/156", "percentage": "0.64%", "elapsed_time": "1m 33s", "remaining_time": "4h 0m 47s", "memory(GiB)": 24.96, "train_speed(iter/s)": 0.010728}
2
+ {"loss": 1.29191422, "grad_norm": 472.01805133, "learning_rate": 2.5e-07, "token_acc": 0.71543646, "epoch": 0.01282051, "global_step/max_steps": "2/156", "percentage": "1.28%", "elapsed_time": "3m 5s", "remaining_time": "3h 58m 12s", "memory(GiB)": 37.77, "train_speed(iter/s)": 0.010775}
3
+ {"loss": 1.3103106, "grad_norm": 295.90115389, "learning_rate": 3.8e-07, "token_acc": 0.71264368, "epoch": 0.01923077, "global_step/max_steps": "3/156", "percentage": "1.92%", "elapsed_time": "4m 30s", "remaining_time": "3h 50m 14s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011076}
4
+ {"loss": 1.35691023, "grad_norm": 420.50233874, "learning_rate": 5e-07, "token_acc": 0.70171893, "epoch": 0.02564103, "global_step/max_steps": "4/156", "percentage": "2.56%", "elapsed_time": "5m 55s", "remaining_time": "3h 45m 0s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011258}
5
+ {"loss": 1.34290111, "grad_norm": 532.48835195, "learning_rate": 6.2e-07, "token_acc": 0.70352352, "epoch": 0.03205128, "global_step/max_steps": "5/156", "percentage": "3.21%", "elapsed_time": "7m 13s", "remaining_time": "3h 38m 20s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011526}
6
+ {"loss": 1.34328222, "grad_norm": 745.15335693, "learning_rate": 7.5e-07, "token_acc": 0.70404929, "epoch": 0.03846154, "global_step/max_steps": "6/156", "percentage": "3.85%", "elapsed_time": "8m 24s", "remaining_time": "3h 30m 9s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011896}
7
+ {"loss": 1.23723578, "grad_norm": 316.95973789, "learning_rate": 8.7e-07, "token_acc": 0.71942562, "epoch": 0.04487179, "global_step/max_steps": "7/156", "percentage": "4.49%", "elapsed_time": "9m 51s", "remaining_time": "3h 29m 49s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011835}
8
+ {"loss": 1.1821543, "grad_norm": 80.89470278, "learning_rate": 1e-06, "token_acc": 0.70987546, "epoch": 0.05128205, "global_step/max_steps": "8/156", "percentage": "5.13%", "elapsed_time": "11m 15s", "remaining_time": "3h 28m 25s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011835}
9
+ {"loss": 1.19064462, "grad_norm": 194.43529651, "learning_rate": 1e-06, "token_acc": 0.70667911, "epoch": 0.05769231, "global_step/max_steps": "9/156", "percentage": "5.77%", "elapsed_time": "12m 47s", "remaining_time": "3h 28m 51s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.01173}
10
+ {"loss": 1.14480925, "grad_norm": 99.39471689, "learning_rate": 1e-06, "token_acc": 0.71327931, "epoch": 0.06410256, "global_step/max_steps": "10/156", "percentage": "6.41%", "elapsed_time": "14m 24s", "remaining_time": "3h 30m 27s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011562}
11
+ {"loss": 1.08528066, "grad_norm": 84.23475207, "learning_rate": 1e-06, "token_acc": 0.7265625, "epoch": 0.07051282, "global_step/max_steps": "11/156", "percentage": "7.05%", "elapsed_time": "15m 39s", "remaining_time": "3h 26m 27s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011705}
12
+ {"loss": 1.03270876, "grad_norm": 85.3562937, "learning_rate": 1e-06, "token_acc": 0.73598248, "epoch": 0.07692308, "global_step/max_steps": "12/156", "percentage": "7.69%", "elapsed_time": "16m 58s", "remaining_time": "3h 23m 46s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011778}
13
+ {"loss": 1.01876926, "grad_norm": 29.61142573, "learning_rate": 1e-06, "token_acc": 0.73305017, "epoch": 0.08333333, "global_step/max_steps": "13/156", "percentage": "8.33%", "elapsed_time": "18m 23s", "remaining_time": "3h 22m 15s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011784}
14
+ {"loss": 1.03286433, "grad_norm": 53.80220393, "learning_rate": 1e-06, "token_acc": 0.73363143, "epoch": 0.08974359, "global_step/max_steps": "14/156", "percentage": "8.97%", "elapsed_time": "19m 52s", "remaining_time": "3h 21m 33s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011742}
15
+ {"loss": 0.93531168, "grad_norm": 26.63791402, "learning_rate": 9.9e-07, "token_acc": 0.75495529, "epoch": 0.09615385, "global_step/max_steps": "15/156", "percentage": "9.62%", "elapsed_time": "21m 14s", "remaining_time": "3h 19m 44s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011765}
16
+ {"loss": 0.89379692, "grad_norm": 241.87630365, "learning_rate": 9.9e-07, "token_acc": 0.76340747, "epoch": 0.1025641, "global_step/max_steps": "16/156", "percentage": "10.26%", "elapsed_time": "22m 17s", "remaining_time": "3h 15m 6s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011959}
17
+ {"loss": 0.90603113, "grad_norm": 260.04513555, "learning_rate": 9.9e-07, "token_acc": 0.76787245, "epoch": 0.10897436, "global_step/max_steps": "17/156", "percentage": "10.90%", "elapsed_time": "23m 29s", "remaining_time": "3h 12m 8s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012057}
18
+ {"loss": 0.88351417, "grad_norm": 13.61011308, "learning_rate": 9.9e-07, "token_acc": 0.76899034, "epoch": 0.11538462, "global_step/max_steps": "18/156", "percentage": "11.54%", "elapsed_time": "24m 54s", "remaining_time": "3h 10m 58s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012044}
19
+ {"loss": 0.86171758, "grad_norm": 13.22296825, "learning_rate": 9.9e-07, "token_acc": 0.77243066, "epoch": 0.12179487, "global_step/max_steps": "19/156", "percentage": "12.18%", "elapsed_time": "26m 21s", "remaining_time": "3h 10m 4s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012012}
20
+ {"loss": 0.83477306, "grad_norm": 10.53957112, "learning_rate": 9.8e-07, "token_acc": 0.7823472, "epoch": 0.12820513, "global_step/max_steps": "20/156", "percentage": "12.82%", "elapsed_time": "28m 13s", "remaining_time": "3h 11m 57s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011808}
21
+ {"loss": 0.87326491, "grad_norm": 10.15724493, "learning_rate": 9.8e-07, "token_acc": 0.76877952, "epoch": 0.13461538, "global_step/max_steps": "21/156", "percentage": "13.46%", "elapsed_time": "29m 44s", "remaining_time": "3h 11m 10s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011769}
22
+ {"loss": 0.853962, "grad_norm": 8.89566429, "learning_rate": 9.8e-07, "token_acc": 0.77166915, "epoch": 0.14102564, "global_step/max_steps": "22/156", "percentage": "14.10%", "elapsed_time": "30m 52s", "remaining_time": "3h 8m 2s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011877}
23
+ {"loss": 0.81837565, "grad_norm": 8.04658391, "learning_rate": 9.7e-07, "token_acc": 0.77758682, "epoch": 0.1474359, "global_step/max_steps": "23/156", "percentage": "14.74%", "elapsed_time": "32m 8s", "remaining_time": "3h 5m 49s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011929}
24
+ {"loss": 0.74004531, "grad_norm": 6.40986945, "learning_rate": 9.7e-07, "token_acc": 0.79958498, "epoch": 0.15384615, "global_step/max_steps": "24/156", "percentage": "15.38%", "elapsed_time": "33m 29s", "remaining_time": "3h 4m 10s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011945}
25
+ {"loss": 0.75977117, "grad_norm": 6.14634333, "learning_rate": 9.7e-07, "token_acc": 0.78765434, "epoch": 0.16025641, "global_step/max_steps": "25/156", "percentage": "16.03%", "elapsed_time": "34m 45s", "remaining_time": "3h 2m 6s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011989}
26
+ {"loss": 0.76574373, "grad_norm": 6.46083555, "learning_rate": 9.6e-07, "token_acc": 0.78669, "epoch": 0.16666667, "global_step/max_steps": "26/156", "percentage": "16.67%", "elapsed_time": "35m 53s", "remaining_time": "2h 59m 29s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012071}
27
+ {"loss": 0.7088508, "grad_norm": 5.9896133, "learning_rate": 9.6e-07, "token_acc": 0.8035453, "epoch": 0.17307692, "global_step/max_steps": "27/156", "percentage": "17.31%", "elapsed_time": "37m 10s", "remaining_time": "2h 57m 38s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012103}
28
+ {"loss": 0.69491911, "grad_norm": 6.86287787, "learning_rate": 9.6e-07, "token_acc": 0.80570823, "epoch": 0.17948718, "global_step/max_steps": "28/156", "percentage": "17.95%", "elapsed_time": "38m 33s", "remaining_time": "2h 56m 16s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012102}
29
+ {"loss": 0.68353021, "grad_norm": 5.45499932, "learning_rate": 9.5e-07, "token_acc": 0.80842853, "epoch": 0.18589744, "global_step/max_steps": "29/156", "percentage": "18.59%", "elapsed_time": "40m 4s", "remaining_time": "2h 55m 29s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012062}
30
+ {"loss": 0.65994573, "grad_norm": 5.46251925, "learning_rate": 9.5e-07, "token_acc": 0.81316453, "epoch": 0.19230769, "global_step/max_steps": "30/156", "percentage": "19.23%", "elapsed_time": "41m 18s", "remaining_time": "2h 53m 30s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012104}
31
+ {"loss": 0.68330657, "grad_norm": 5.90894386, "learning_rate": 9.4e-07, "token_acc": 0.81380141, "epoch": 0.19871795, "global_step/max_steps": "31/156", "percentage": "19.87%", "elapsed_time": "42m 29s", "remaining_time": "2h 51m 19s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012161}
32
+ {"loss": 0.68613803, "grad_norm": 5.28856744, "learning_rate": 9.4e-07, "token_acc": 0.80556458, "epoch": 0.20512821, "global_step/max_steps": "32/156", "percentage": "20.51%", "elapsed_time": "43m 36s", "remaining_time": "2h 48m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01223}
33
+ {"loss": 0.63391989, "grad_norm": 5.35706529, "learning_rate": 9.3e-07, "token_acc": 0.81883174, "epoch": 0.21153846, "global_step/max_steps": "33/156", "percentage": "21.15%", "elapsed_time": "44m 55s", "remaining_time": "2h 47m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012241}
34
+ {"loss": 0.6426698, "grad_norm": 5.16060255, "learning_rate": 9.3e-07, "token_acc": 0.81714875, "epoch": 0.21794872, "global_step/max_steps": "34/156", "percentage": "21.79%", "elapsed_time": "46m 13s", "remaining_time": "2h 45m 51s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01226}
35
+ {"loss": 0.6970824, "grad_norm": 4.8647035, "learning_rate": 9.2e-07, "token_acc": 0.80546129, "epoch": 0.22435897, "global_step/max_steps": "35/156", "percentage": "22.44%", "elapsed_time": "47m 25s", "remaining_time": "2h 43m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012301}
36
+ {"loss": 0.63011646, "grad_norm": 5.20984227, "learning_rate": 9.1e-07, "token_acc": 0.81981981, "epoch": 0.23076923, "global_step/max_steps": "36/156", "percentage": "23.08%", "elapsed_time": "48m 54s", "remaining_time": "2h 43m 2s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
37
+ {"loss": 0.64742309, "grad_norm": 5.06085511, "learning_rate": 9.1e-07, "token_acc": 0.8159399, "epoch": 0.23717949, "global_step/max_steps": "37/156", "percentage": "23.72%", "elapsed_time": "50m 11s", "remaining_time": "2h 41m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012287}
38
+ {"loss": 0.69717383, "grad_norm": 4.61811775, "learning_rate": 9e-07, "token_acc": 0.80520284, "epoch": 0.24358974, "global_step/max_steps": "38/156", "percentage": "24.36%", "elapsed_time": "51m 43s", "remaining_time": "2h 40m 37s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012243}
39
+ {"loss": 0.62579024, "grad_norm": 4.66712972, "learning_rate": 9e-07, "token_acc": 0.81843996, "epoch": 0.25, "global_step/max_steps": "39/156", "percentage": "25.00%", "elapsed_time": "53m 3s", "remaining_time": "2h 39m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012251}
40
+ {"loss": 0.65105742, "grad_norm": 4.75867456, "learning_rate": 8.9e-07, "token_acc": 0.81644934, "epoch": 0.25641026, "global_step/max_steps": "40/156", "percentage": "25.64%", "elapsed_time": "54m 22s", "remaining_time": "2h 37m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012261}
41
+ {"loss": 0.63637537, "grad_norm": 4.88613943, "learning_rate": 8.8e-07, "token_acc": 0.81838524, "epoch": 0.26282051, "global_step/max_steps": "41/156", "percentage": "26.28%", "elapsed_time": "55m 40s", "remaining_time": "2h 36m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
42
+ {"loss": 0.65643597, "grad_norm": 4.73657968, "learning_rate": 8.8e-07, "token_acc": 0.81969416, "epoch": 0.26923077, "global_step/max_steps": "42/156", "percentage": "26.92%", "elapsed_time": "56m 50s", "remaining_time": "2h 34m 18s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012313}
43
+ {"loss": 0.63899696, "grad_norm": 4.55758133, "learning_rate": 8.7e-07, "token_acc": 0.81999677, "epoch": 0.27564103, "global_step/max_steps": "43/156", "percentage": "27.56%", "elapsed_time": "58m 12s", "remaining_time": "2h 32m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012314}
44
+ {"loss": 0.63628721, "grad_norm": 4.99415384, "learning_rate": 8.6e-07, "token_acc": 0.8194927, "epoch": 0.28205128, "global_step/max_steps": "44/156", "percentage": "28.21%", "elapsed_time": "59m 35s", "remaining_time": "2h 31m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012306}
45
+ {"loss": 0.63944864, "grad_norm": 4.54901661, "learning_rate": 8.5e-07, "token_acc": 0.81742072, "epoch": 0.28846154, "global_step/max_steps": "45/156", "percentage": "28.85%", "elapsed_time": "1h 1m 0s", "remaining_time": "2h 30m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012295}
46
+ {"loss": 0.61701751, "grad_norm": 4.76973775, "learning_rate": 8.5e-07, "token_acc": 0.82378399, "epoch": 0.29487179, "global_step/max_steps": "46/156", "percentage": "29.49%", "elapsed_time": "1h 2m 17s", "remaining_time": "2h 28m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012309}
47
+ {"loss": 0.59330505, "grad_norm": 4.68095775, "learning_rate": 8.4e-07, "token_acc": 0.82901824, "epoch": 0.30128205, "global_step/max_steps": "47/156", "percentage": "30.13%", "elapsed_time": "1h 3m 28s", "remaining_time": "2h 27m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012342}
48
+ {"loss": 0.60955942, "grad_norm": 4.52158153, "learning_rate": 8.3e-07, "token_acc": 0.82506776, "epoch": 0.30769231, "global_step/max_steps": "48/156", "percentage": "30.77%", "elapsed_time": "1h 4m 39s", "remaining_time": "2h 25m 29s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012372}
49
+ {"loss": 0.58634937, "grad_norm": 4.81313193, "learning_rate": 8.2e-07, "token_acc": 0.83429396, "epoch": 0.31410256, "global_step/max_steps": "49/156", "percentage": "31.41%", "elapsed_time": "1h 6m 2s", "remaining_time": "2h 24m 12s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012366}
50
+ {"loss": 0.62977922, "grad_norm": 4.87404017, "learning_rate": 8.1e-07, "token_acc": 0.8187986, "epoch": 0.32051282, "global_step/max_steps": "50/156", "percentage": "32.05%", "elapsed_time": "1h 7m 14s", "remaining_time": "2h 22m 32s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
51
+ {"loss": 0.59493053, "grad_norm": 4.94227702, "learning_rate": 8.1e-07, "token_acc": 0.83116883, "epoch": 0.32692308, "global_step/max_steps": "51/156", "percentage": "32.69%", "elapsed_time": "1h 8m 33s", "remaining_time": "2h 21m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012399}
52
+ {"loss": 0.57395041, "grad_norm": 4.39539597, "learning_rate": 8e-07, "token_acc": 0.83097446, "epoch": 0.33333333, "global_step/max_steps": "52/156", "percentage": "33.33%", "elapsed_time": "1h 10m 0s", "remaining_time": "2h 20m 0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01238}
53
+ {"loss": 0.61727303, "grad_norm": 4.73591311, "learning_rate": 7.9e-07, "token_acc": 0.82129598, "epoch": 0.33974359, "global_step/max_steps": "53/156", "percentage": "33.97%", "elapsed_time": "1h 11m 36s", "remaining_time": "2h 19m 9s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012335}
54
+ {"loss": 0.55737722, "grad_norm": 4.37825661, "learning_rate": 7.8e-07, "token_acc": 0.84513646, "epoch": 0.34615385, "global_step/max_steps": "54/156", "percentage": "34.62%", "elapsed_time": "1h 12m 46s", "remaining_time": "2h 17m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012366}
55
+ {"loss": 0.57170796, "grad_norm": 4.68421898, "learning_rate": 7.7e-07, "token_acc": 0.83828151, "epoch": 0.3525641, "global_step/max_steps": "55/156", "percentage": "35.26%", "elapsed_time": "1h 14m 1s", "remaining_time": "2h 15m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012383}
56
+ {"loss": 0.59979248, "grad_norm": 4.42801189, "learning_rate": 7.6e-07, "token_acc": 0.8215729, "epoch": 0.35897436, "global_step/max_steps": "56/156", "percentage": "35.90%", "elapsed_time": "1h 15m 27s", "remaining_time": "2h 14m 45s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012369}
57
+ {"loss": 0.55416632, "grad_norm": 4.82672986, "learning_rate": 7.5e-07, "token_acc": 0.8405695, "epoch": 0.36538462, "global_step/max_steps": "57/156", "percentage": "36.54%", "elapsed_time": "1h 16m 41s", "remaining_time": "2h 13m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
58
+ {"loss": 0.57087392, "grad_norm": 4.35926386, "learning_rate": 7.4e-07, "token_acc": 0.83499038, "epoch": 0.37179487, "global_step/max_steps": "58/156", "percentage": "37.18%", "elapsed_time": "1h 18m 10s", "remaining_time": "2h 12m 5s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012365}
59
+ {"loss": 0.54257536, "grad_norm": 4.82886644, "learning_rate": 7.3e-07, "token_acc": 0.83735251, "epoch": 0.37820513, "global_step/max_steps": "59/156", "percentage": "37.82%", "elapsed_time": "1h 19m 24s", "remaining_time": "2h 10m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012382}
60
+ {"loss": 0.56238914, "grad_norm": 4.59861324, "learning_rate": 7.3e-07, "token_acc": 0.83281517, "epoch": 0.38461538, "global_step/max_steps": "60/156", "percentage": "38.46%", "elapsed_time": "1h 20m 50s", "remaining_time": "2h 9m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01237}
61
+ {"loss": 0.57429522, "grad_norm": 4.84046203, "learning_rate": 7.2e-07, "token_acc": 0.82906735, "epoch": 0.39102564, "global_step/max_steps": "61/156", "percentage": "39.10%", "elapsed_time": "1h 22m 14s", "remaining_time": "2h 8m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012362}
62
+ {"loss": 0.58638889, "grad_norm": 4.23328234, "learning_rate": 7.1e-07, "token_acc": 0.82650191, "epoch": 0.3974359, "global_step/max_steps": "62/156", "percentage": "39.74%", "elapsed_time": "1h 23m 45s", "remaining_time": "2h 6m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012336}
63
+ {"loss": 0.53687143, "grad_norm": 4.39388537, "learning_rate": 7e-07, "token_acc": 0.84317684, "epoch": 0.40384615, "global_step/max_steps": "63/156", "percentage": "40.38%", "elapsed_time": "1h 25m 19s", "remaining_time": "2h 5m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012307}
64
+ {"loss": 0.60582626, "grad_norm": 4.66253018, "learning_rate": 6.9e-07, "token_acc": 0.82148314, "epoch": 0.41025641, "global_step/max_steps": "64/156", "percentage": "41.03%", "elapsed_time": "1h 26m 44s", "remaining_time": "2h 4m 41s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012297}
65
+ {"loss": 0.55621356, "grad_norm": 4.558812, "learning_rate": 6.8e-07, "token_acc": 0.83885592, "epoch": 0.41666667, "global_step/max_steps": "65/156", "percentage": "41.67%", "elapsed_time": "1h 28m 15s", "remaining_time": "2h 3m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
66
+ {"loss": 0.56102502, "grad_norm": 4.76539571, "learning_rate": 6.7e-07, "token_acc": 0.83055019, "epoch": 0.42307692, "global_step/max_steps": "66/156", "percentage": "42.31%", "elapsed_time": "1h 29m 28s", "remaining_time": "2h 2m 1s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012293}
67
+ {"loss": 0.54189265, "grad_norm": 4.56774461, "learning_rate": 6.6e-07, "token_acc": 0.84312022, "epoch": 0.42948718, "global_step/max_steps": "67/156", "percentage": "42.95%", "elapsed_time": "1h 30m 56s", "remaining_time": "2h 0m 48s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012278}
68
+ {"loss": 0.55337489, "grad_norm": 4.52994682, "learning_rate": 6.5e-07, "token_acc": 0.83290797, "epoch": 0.43589744, "global_step/max_steps": "68/156", "percentage": "43.59%", "elapsed_time": "1h 32m 17s", "remaining_time": "1h 59m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01228}
69
+ {"loss": 0.5477522, "grad_norm": 4.45722127, "learning_rate": 6.4e-07, "token_acc": 0.8378194, "epoch": 0.44230769, "global_step/max_steps": "69/156", "percentage": "44.23%", "elapsed_time": "1h 33m 57s", "remaining_time": "1h 58m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01224}
70
+ {"loss": 0.55349118, "grad_norm": 4.61153178, "learning_rate": 6.3e-07, "token_acc": 0.8411966, "epoch": 0.44871795, "global_step/max_steps": "70/156", "percentage": "44.87%", "elapsed_time": "1h 35m 20s", "remaining_time": "1h 57m 7s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012237}
71
+ {"loss": 0.53457195, "grad_norm": 4.41866199, "learning_rate": 6.2e-07, "token_acc": 0.83968687, "epoch": 0.45512821, "global_step/max_steps": "71/156", "percentage": "45.51%", "elapsed_time": "1h 36m 36s", "remaining_time": "1h 55m 39s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012248}
72
+ {"loss": 0.57335734, "grad_norm": 4.43861453, "learning_rate": 6.1e-07, "token_acc": 0.83286119, "epoch": 0.46153846, "global_step/max_steps": "72/156", "percentage": "46.15%", "elapsed_time": "1h 37m 51s", "remaining_time": "1h 54m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012262}
73
+ {"loss": 0.54812038, "grad_norm": 4.30486017, "learning_rate": 5.9e-07, "token_acc": 0.83778405, "epoch": 0.46794872, "global_step/max_steps": "73/156", "percentage": "46.79%", "elapsed_time": "1h 39m 10s", "remaining_time": "1h 52m 46s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
74
+ {"loss": 0.55430961, "grad_norm": 4.1943194, "learning_rate": 5.8e-07, "token_acc": 0.83625638, "epoch": 0.47435897, "global_step/max_steps": "74/156", "percentage": "47.44%", "elapsed_time": "1h 40m 28s", "remaining_time": "1h 51m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
75
+ {"loss": 0.57966816, "grad_norm": 4.62980085, "learning_rate": 5.7e-07, "token_acc": 0.83285105, "epoch": 0.48076923, "global_step/max_steps": "75/156", "percentage": "48.08%", "elapsed_time": "1h 41m 40s", "remaining_time": "1h 49m 48s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012294}
76
+ {"loss": 0.54380375, "grad_norm": 4.35547228, "learning_rate": 5.6e-07, "token_acc": 0.84344262, "epoch": 0.48717949, "global_step/max_steps": "76/156", "percentage": "48.72%", "elapsed_time": "1h 42m 50s", "remaining_time": "1h 48m 15s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012317}
77
+ {"loss": 0.53343827, "grad_norm": 4.62774062, "learning_rate": 5.5e-07, "token_acc": 0.84348905, "epoch": 0.49358974, "global_step/max_steps": "77/156", "percentage": "49.36%", "elapsed_time": "1h 44m 42s", "remaining_time": "1h 47m 26s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012256}
78
+ {"loss": 0.53137887, "grad_norm": 4.27804086, "learning_rate": 5.4e-07, "token_acc": 0.84490591, "epoch": 0.5, "global_step/max_steps": "78/156", "percentage": "50.00%", "elapsed_time": "1h 45m 58s", "remaining_time": "1h 45m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
79
+ {"loss": 0.52323365, "grad_norm": 4.09285521, "learning_rate": 5.3e-07, "token_acc": 0.8468861, "epoch": 0.50641026, "global_step/max_steps": "79/156", "percentage": "50.64%", "elapsed_time": "1h 47m 18s", "remaining_time": "1h 44m 35s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012269}
80
+ {"loss": 0.54979551, "grad_norm": 4.42899359, "learning_rate": 5.2e-07, "token_acc": 0.84460127, "epoch": 0.51282051, "global_step/max_steps": "80/156", "percentage": "51.28%", "elapsed_time": "1h 48m 49s", "remaining_time": "1h 43m 23s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012252}
81
+ {"loss": 0.54692674, "grad_norm": 4.21577617, "learning_rate": 5.1e-07, "token_acc": 0.83669257, "epoch": 0.51923077, "global_step/max_steps": "81/156", "percentage": "51.92%", "elapsed_time": "1h 50m 3s", "remaining_time": "1h 41m 54s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
82
+ {"loss": 0.52708614, "grad_norm": 4.43886275, "learning_rate": 5e-07, "token_acc": 0.83714193, "epoch": 0.52564103, "global_step/max_steps": "82/156", "percentage": "52.56%", "elapsed_time": "1h 51m 11s", "remaining_time": "1h 40m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012292}
83
+ {"loss": 0.53395653, "grad_norm": 4.17682636, "learning_rate": 4.9e-07, "token_acc": 0.84171021, "epoch": 0.53205128, "global_step/max_steps": "83/156", "percentage": "53.21%", "elapsed_time": "1h 52m 30s", "remaining_time": "1h 38m 57s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012295}
84
+ {"loss": 0.54247582, "grad_norm": 4.53019853, "learning_rate": 4.8e-07, "token_acc": 0.83958578, "epoch": 0.53846154, "global_step/max_steps": "84/156", "percentage": "53.85%", "elapsed_time": "1h 53m 51s", "remaining_time": "1h 37m 35s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012296}
85
+ {"loss": 0.52001339, "grad_norm": 4.37050988, "learning_rate": 4.7e-07, "token_acc": 0.84339178, "epoch": 0.54487179, "global_step/max_steps": "85/156", "percentage": "54.49%", "elapsed_time": "1h 55m 46s", "remaining_time": "1h 36m 42s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012236}
86
+ {"loss": 0.54800576, "grad_norm": 4.38290539, "learning_rate": 4.6e-07, "token_acc": 0.83976507, "epoch": 0.55128205, "global_step/max_steps": "86/156", "percentage": "55.13%", "elapsed_time": "1h 57m 13s", "remaining_time": "1h 35m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012227}
87
+ {"loss": 0.52284074, "grad_norm": 4.34244252, "learning_rate": 4.5e-07, "token_acc": 0.85395187, "epoch": 0.55769231, "global_step/max_steps": "87/156", "percentage": "55.77%", "elapsed_time": "1h 58m 37s", "remaining_time": "1h 34m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012223}
88
+ {"loss": 0.54131281, "grad_norm": 4.13734102, "learning_rate": 4.4e-07, "token_acc": 0.83685899, "epoch": 0.56410256, "global_step/max_steps": "88/156", "percentage": "56.41%", "elapsed_time": "1h 59m 48s", "remaining_time": "1h 32m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012242}
89
+ {"loss": 0.51603007, "grad_norm": 4.36974965, "learning_rate": 4.3e-07, "token_acc": 0.84785974, "epoch": 0.57051282, "global_step/max_steps": "89/156", "percentage": "57.05%", "elapsed_time": "2h 1m 17s", "remaining_time": "1h 31m 18s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01223}
90
+ {"loss": 0.53859156, "grad_norm": 4.56841423, "learning_rate": 4.2e-07, "token_acc": 0.84574932, "epoch": 0.57692308, "global_step/max_steps": "90/156", "percentage": "57.69%", "elapsed_time": "2h 2m 41s", "remaining_time": "1h 29m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012225}
91
+ {"loss": 0.52594697, "grad_norm": 4.26307779, "learning_rate": 4.1e-07, "token_acc": 0.8433637, "epoch": 0.58333333, "global_step/max_steps": "91/156", "percentage": "58.33%", "elapsed_time": "2h 3m 45s", "remaining_time": "1h 28m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012255}
92
+ {"loss": 0.52416551, "grad_norm": 4.24665846, "learning_rate": 3.9e-07, "token_acc": 0.84782606, "epoch": 0.58974359, "global_step/max_steps": "92/156", "percentage": "58.97%", "elapsed_time": "2h 5m 11s", "remaining_time": "1h 27m 5s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012247}
93
+ {"loss": 0.54643536, "grad_norm": 4.42645624, "learning_rate": 3.8e-07, "token_acc": 0.83482218, "epoch": 0.59615385, "global_step/max_steps": "93/156", "percentage": "59.62%", "elapsed_time": "2h 6m 45s", "remaining_time": "1h 25m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012228}
94
+ {"loss": 0.53462034, "grad_norm": 4.57135257, "learning_rate": 3.7e-07, "token_acc": 0.84266156, "epoch": 0.6025641, "global_step/max_steps": "94/156", "percentage": "60.26%", "elapsed_time": "2h 7m 59s", "remaining_time": "1h 24m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01224}
95
+ {"loss": 0.4898929, "grad_norm": 4.30288369, "learning_rate": 3.6e-07, "token_acc": 0.85457271, "epoch": 0.60897436, "global_step/max_steps": "95/156", "percentage": "60.90%", "elapsed_time": "2h 9m 12s", "remaining_time": "1h 22m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012253}
96
+ {"loss": 0.55974352, "grad_norm": 4.48851467, "learning_rate": 3.5e-07, "token_acc": 0.84236282, "epoch": 0.61538462, "global_step/max_steps": "96/156", "percentage": "61.54%", "elapsed_time": "2h 10m 30s", "remaining_time": "1h 21m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01226}
97
+ {"loss": 0.59083301, "grad_norm": 4.50473731, "learning_rate": 3.4e-07, "token_acc": 0.83140576, "epoch": 0.62179487, "global_step/max_steps": "97/156", "percentage": "62.18%", "elapsed_time": "2h 11m 52s", "remaining_time": "1h 20m 12s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012259}
98
+ {"loss": 0.54224521, "grad_norm": 4.32698999, "learning_rate": 3.3e-07, "token_acc": 0.84555858, "epoch": 0.62820513, "global_step/max_steps": "98/156", "percentage": "62.82%", "elapsed_time": "2h 13m 7s", "remaining_time": "1h 18m 47s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012269}
99
+ {"loss": 0.4947038, "grad_norm": 4.0718176, "learning_rate": 3.2e-07, "token_acc": 0.85249788, "epoch": 0.63461538, "global_step/max_steps": "99/156", "percentage": "63.46%", "elapsed_time": "2h 14m 18s", "remaining_time": "1h 17m 19s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012285}
100
+ {"loss": 0.48203486, "grad_norm": 4.36429258, "learning_rate": 3.1e-07, "token_acc": 0.85731459, "epoch": 0.64102564, "global_step/max_steps": "100/156", "percentage": "64.10%", "elapsed_time": "2h 15m 42s", "remaining_time": "1h 15m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012281}
101
+ {"loss": 0.57775241, "grad_norm": 4.24510136, "learning_rate": 3e-07, "token_acc": 0.83192444, "epoch": 0.6474359, "global_step/max_steps": "101/156", "percentage": "64.74%", "elapsed_time": "2h 16m 51s", "remaining_time": "1h 14m 31s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012299}
102
+ {"loss": 0.51172423, "grad_norm": 4.34557783, "learning_rate": 2.9e-07, "token_acc": 0.85045135, "epoch": 0.65384615, "global_step/max_steps": "102/156", "percentage": "65.38%", "elapsed_time": "2h 18m 14s", "remaining_time": "1h 13m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012298}
103
+ {"loss": 0.59052503, "grad_norm": 4.35483555, "learning_rate": 2.8e-07, "token_acc": 0.82920426, "epoch": 0.66025641, "global_step/max_steps": "103/156", "percentage": "66.03%", "elapsed_time": "2h 19m 34s", "remaining_time": "1h 11m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012299}
104
+ {"loss": 0.53812402, "grad_norm": 4.37309245, "learning_rate": 2.7e-07, "token_acc": 0.83758545, "epoch": 0.66666667, "global_step/max_steps": "104/156", "percentage": "66.67%", "elapsed_time": "2h 20m 48s", "remaining_time": "1h 10m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01231}
105
+ {"loss": 0.51737601, "grad_norm": 4.15818775, "learning_rate": 2.7e-07, "token_acc": 0.84191114, "epoch": 0.67307692, "global_step/max_steps": "105/156", "percentage": "67.31%", "elapsed_time": "2h 22m 1s", "remaining_time": "1h 8m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012322}
106
+ {"loss": 0.52918124, "grad_norm": 4.36027095, "learning_rate": 2.6e-07, "token_acc": 0.84818482, "epoch": 0.67948718, "global_step/max_steps": "106/156", "percentage": "67.95%", "elapsed_time": "2h 23m 15s", "remaining_time": "1h 7m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012332}
107
+ {"loss": 0.54951406, "grad_norm": 4.39465482, "learning_rate": 2.5e-07, "token_acc": 0.83936614, "epoch": 0.68589744, "global_step/max_steps": "107/156", "percentage": "68.59%", "elapsed_time": "2h 24m 37s", "remaining_time": "1h 6m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01233}
108
+ {"loss": 0.52851987, "grad_norm": 4.48237752, "learning_rate": 2.4e-07, "token_acc": 0.84538782, "epoch": 0.69230769, "global_step/max_steps": "108/156", "percentage": "69.23%", "elapsed_time": "2h 25m 51s", "remaining_time": "1h 4m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
109
+ {"loss": 0.49810505, "grad_norm": 4.21900446, "learning_rate": 2.3e-07, "token_acc": 0.84801209, "epoch": 0.69871795, "global_step/max_steps": "109/156", "percentage": "69.87%", "elapsed_time": "2h 27m 11s", "remaining_time": "1h 3m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012343}
110
+ {"loss": 0.5449968, "grad_norm": 4.52981291, "learning_rate": 2.2e-07, "token_acc": 0.83842641, "epoch": 0.70512821, "global_step/max_steps": "110/156", "percentage": "70.51%", "elapsed_time": "2h 28m 47s", "remaining_time": "1h 2m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012321}
111
+ {"loss": 0.53004676, "grad_norm": 4.26468111, "learning_rate": 2.1e-07, "token_acc": 0.84083682, "epoch": 0.71153846, "global_step/max_steps": "111/156", "percentage": "71.15%", "elapsed_time": "2h 30m 3s", "remaining_time": "1h 0m 50s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012328}
112
+ {"loss": 0.55461526, "grad_norm": 4.2244321, "learning_rate": 2e-07, "token_acc": 0.83352208, "epoch": 0.71794872, "global_step/max_steps": "112/156", "percentage": "71.79%", "elapsed_time": "2h 31m 22s", "remaining_time": "59m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012332}
113
+ {"loss": 0.59113604, "grad_norm": 4.40275179, "learning_rate": 1.9e-07, "token_acc": 0.82692927, "epoch": 0.72435897, "global_step/max_steps": "113/156", "percentage": "72.44%", "elapsed_time": "2h 32m 49s", "remaining_time": "58m 9s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012323}
114
+ {"loss": 0.54400551, "grad_norm": 4.43860454, "learning_rate": 1.9e-07, "token_acc": 0.84398371, "epoch": 0.73076923, "global_step/max_steps": "114/156", "percentage": "73.08%", "elapsed_time": "2h 33m 57s", "remaining_time": "56m 43s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
115
+ {"loss": 0.52488923, "grad_norm": 4.33736014, "learning_rate": 1.8e-07, "token_acc": 0.84285957, "epoch": 0.73717949, "global_step/max_steps": "115/156", "percentage": "73.72%", "elapsed_time": "2h 35m 14s", "remaining_time": "55m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012346}
116
+ {"loss": 0.50442624, "grad_norm": 3.97877867, "learning_rate": 1.7e-07, "token_acc": 0.85209531, "epoch": 0.74358974, "global_step/max_steps": "116/156", "percentage": "74.36%", "elapsed_time": "2h 36m 31s", "remaining_time": "53m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012352}
117
+ {"loss": 0.52389383, "grad_norm": 4.20646601, "learning_rate": 1.6e-07, "token_acc": 0.85036618, "epoch": 0.75, "global_step/max_steps": "117/156", "percentage": "75.00%", "elapsed_time": "2h 37m 50s", "remaining_time": "52m 36s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012354}
118
+ {"loss": 0.48032126, "grad_norm": 4.11543544, "learning_rate": 1.5e-07, "token_acc": 0.86102718, "epoch": 0.75641026, "global_step/max_steps": "118/156", "percentage": "75.64%", "elapsed_time": "2h 39m 10s", "remaining_time": "51m 15s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012356}
119
+ {"loss": 0.50924706, "grad_norm": 4.30736574, "learning_rate": 1.5e-07, "token_acc": 0.84713054, "epoch": 0.76282051, "global_step/max_steps": "119/156", "percentage": "76.28%", "elapsed_time": "2h 40m 30s", "remaining_time": "49m 54s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012356}
120
+ {"loss": 0.5285455, "grad_norm": 4.2289696, "learning_rate": 1.4e-07, "token_acc": 0.84022439, "epoch": 0.76923077, "global_step/max_steps": "120/156", "percentage": "76.92%", "elapsed_time": "2h 42m 4s", "remaining_time": "48m 37s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01234}
121
+ {"loss": 0.51606965, "grad_norm": 4.31743708, "learning_rate": 1.3e-07, "token_acc": 0.84314388, "epoch": 0.77564103, "global_step/max_steps": "121/156", "percentage": "77.56%", "elapsed_time": "2h 43m 19s", "remaining_time": "47m 14s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012348}
122
+ {"loss": 0.58972692, "grad_norm": 4.20623043, "learning_rate": 1.2e-07, "token_acc": 0.82227379, "epoch": 0.78205128, "global_step/max_steps": "122/156", "percentage": "78.21%", "elapsed_time": "2h 44m 47s", "remaining_time": "45m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012339}
123
+ {"loss": 0.52436459, "grad_norm": 4.53727067, "learning_rate": 1.2e-07, "token_acc": 0.84348273, "epoch": 0.78846154, "global_step/max_steps": "123/156", "percentage": "78.85%", "elapsed_time": "2h 46m 4s", "remaining_time": "44m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012344}
124
+ {"loss": 0.53095937, "grad_norm": 4.3389721, "learning_rate": 1.1e-07, "token_acc": 0.84100276, "epoch": 0.79487179, "global_step/max_steps": "124/156", "percentage": "79.49%", "elapsed_time": "2h 47m 31s", "remaining_time": "43m 14s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012336}
125
+ {"loss": 0.51528633, "grad_norm": 4.17226894, "learning_rate": 1e-07, "token_acc": 0.8451646, "epoch": 0.80128205, "global_step/max_steps": "125/156", "percentage": "80.13%", "elapsed_time": "2h 48m 49s", "remaining_time": "41m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
126
+ {"loss": 0.53967261, "grad_norm": 4.25014974, "learning_rate": 1e-07, "token_acc": 0.84366947, "epoch": 0.80769231, "global_step/max_steps": "126/156", "percentage": "80.77%", "elapsed_time": "2h 50m 7s", "remaining_time": "40m 30s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012344}
127
+ {"loss": 0.52977896, "grad_norm": 4.40560735, "learning_rate": 9e-08, "token_acc": 0.83746511, "epoch": 0.81410256, "global_step/max_steps": "127/156", "percentage": "81.41%", "elapsed_time": "2h 51m 17s", "remaining_time": "39m 6s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012357}
128
+ {"loss": 0.47736442, "grad_norm": 4.25610765, "learning_rate": 9e-08, "token_acc": 0.85747331, "epoch": 0.82051282, "global_step/max_steps": "128/156", "percentage": "82.05%", "elapsed_time": "2h 52m 29s", "remaining_time": "37m 44s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012367}
129
+ {"loss": 0.50190258, "grad_norm": 4.40234549, "learning_rate": 8e-08, "token_acc": 0.84879065, "epoch": 0.82692308, "global_step/max_steps": "129/156", "percentage": "82.69%", "elapsed_time": "2h 53m 44s", "remaining_time": "36m 21s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012374}
130
+ {"loss": 0.50948274, "grad_norm": 4.22299164, "learning_rate": 7e-08, "token_acc": 0.84777558, "epoch": 0.83333333, "global_step/max_steps": "130/156", "percentage": "83.33%", "elapsed_time": "2h 55m 10s", "remaining_time": "35m 2s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012369}
131
+ {"loss": 0.55336982, "grad_norm": 4.33197224, "learning_rate": 7e-08, "token_acc": 0.83738577, "epoch": 0.83974359, "global_step/max_steps": "131/156", "percentage": "83.97%", "elapsed_time": "2h 56m 28s", "remaining_time": "33m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012372}
132
+ {"loss": 0.50672126, "grad_norm": 4.0856793, "learning_rate": 6e-08, "token_acc": 0.85186362, "epoch": 0.84615385, "global_step/max_steps": "132/156", "percentage": "84.62%", "elapsed_time": "2h 57m 38s", "remaining_time": "32m 17s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012385}
133
+ {"loss": 0.49411821, "grad_norm": 4.27345374, "learning_rate": 6e-08, "token_acc": 0.84789586, "epoch": 0.8525641, "global_step/max_steps": "133/156", "percentage": "85.26%", "elapsed_time": "2h 58m 47s", "remaining_time": "30m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012398}
134
+ {"loss": 0.55297947, "grad_norm": 4.26234047, "learning_rate": 5e-08, "token_acc": 0.8371467, "epoch": 0.85897436, "global_step/max_steps": "134/156", "percentage": "85.90%", "elapsed_time": "3h 0m 11s", "remaining_time": "29m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
135
+ {"loss": 0.5094257, "grad_norm": 4.38213946, "learning_rate": 5e-08, "token_acc": 0.85126138, "epoch": 0.86538462, "global_step/max_steps": "135/156", "percentage": "86.54%", "elapsed_time": "3h 1m 47s", "remaining_time": "28m 16s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012377}
136
+ {"loss": 0.52112681, "grad_norm": 4.62171216, "learning_rate": 4e-08, "token_acc": 0.84375, "epoch": 0.87179487, "global_step/max_steps": "136/156", "percentage": "87.18%", "elapsed_time": "3h 3m 3s", "remaining_time": "26m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012382}
137
+ {"loss": 0.49088389, "grad_norm": 4.20408874, "learning_rate": 4e-08, "token_acc": 0.85649776, "epoch": 0.87820513, "global_step/max_steps": "137/156", "percentage": "87.82%", "elapsed_time": "3h 4m 18s", "remaining_time": "25m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
138
+ {"loss": 0.50413823, "grad_norm": 4.37768117, "learning_rate": 4e-08, "token_acc": 0.8541953, "epoch": 0.88461538, "global_step/max_steps": "138/156", "percentage": "88.46%", "elapsed_time": "3h 5m 40s", "remaining_time": "24m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
139
+ {"loss": 0.54804325, "grad_norm": 4.32917767, "learning_rate": 3e-08, "token_acc": 0.83703583, "epoch": 0.89102564, "global_step/max_steps": "139/156", "percentage": "89.10%", "elapsed_time": "3h 7m 0s", "remaining_time": "22m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
140
+ {"loss": 0.54084694, "grad_norm": 4.19027868, "learning_rate": 3e-08, "token_acc": 0.84083045, "epoch": 0.8974359, "global_step/max_steps": "140/156", "percentage": "89.74%", "elapsed_time": "3h 8m 17s", "remaining_time": "21m 31s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012392}
141
+ {"loss": 0.52507019, "grad_norm": 4.42979992, "learning_rate": 3e-08, "token_acc": 0.84048975, "epoch": 0.90384615, "global_step/max_steps": "141/156", "percentage": "90.38%", "elapsed_time": "3h 9m 34s", "remaining_time": "20m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012396}
142
+ {"loss": 0.49129876, "grad_norm": 4.24055645, "learning_rate": 2e-08, "token_acc": 0.85284394, "epoch": 0.91025641, "global_step/max_steps": "142/156", "percentage": "91.03%", "elapsed_time": "3h 10m 56s", "remaining_time": "18m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
143
+ {"loss": 0.51070774, "grad_norm": 4.0944321, "learning_rate": 2e-08, "token_acc": 0.84804004, "epoch": 0.91666667, "global_step/max_steps": "143/156", "percentage": "91.67%", "elapsed_time": "3h 12m 10s", "remaining_time": "17m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012401}
144
+ {"loss": 0.48216087, "grad_norm": 4.09610548, "learning_rate": 2e-08, "token_acc": 0.85173351, "epoch": 0.92307692, "global_step/max_steps": "144/156", "percentage": "92.31%", "elapsed_time": "3h 13m 37s", "remaining_time": "16m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
145
+ {"loss": 0.48092702, "grad_norm": 3.99988943, "learning_rate": 1e-08, "token_acc": 0.85898077, "epoch": 0.92948718, "global_step/max_steps": "145/156", "percentage": "92.95%", "elapsed_time": "3h 15m 2s", "remaining_time": "14m 47s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012391}
146
+ {"loss": 0.47554287, "grad_norm": 4.31274229, "learning_rate": 1e-08, "token_acc": 0.86201912, "epoch": 0.93589744, "global_step/max_steps": "146/156", "percentage": "93.59%", "elapsed_time": "3h 16m 24s", "remaining_time": "13m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012389}
147
+ {"loss": 0.53879178, "grad_norm": 4.53572821, "learning_rate": 1e-08, "token_acc": 0.83533359, "epoch": 0.94230769, "global_step/max_steps": "147/156", "percentage": "94.23%", "elapsed_time": "3h 17m 41s", "remaining_time": "12m 6s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012393}
148
+ {"loss": 0.48995703, "grad_norm": 4.16794684, "learning_rate": 1e-08, "token_acc": 0.85930586, "epoch": 0.94871795, "global_step/max_steps": "148/156", "percentage": "94.87%", "elapsed_time": "3h 19m 1s", "remaining_time": "10m 45s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
149
+ {"loss": 0.54540122, "grad_norm": 4.23517781, "learning_rate": 1e-08, "token_acc": 0.84179008, "epoch": 0.95512821, "global_step/max_steps": "149/156", "percentage": "95.51%", "elapsed_time": "3h 20m 17s", "remaining_time": "9m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012399}
150
+ {"loss": 0.52318221, "grad_norm": 4.2966771, "learning_rate": 0.0, "token_acc": 0.84380364, "epoch": 0.96153846, "global_step/max_steps": "150/156", "percentage": "96.15%", "elapsed_time": "3h 21m 43s", "remaining_time": "8m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
151
+ {"loss": 0.51414204, "grad_norm": 4.32346855, "learning_rate": 0.0, "token_acc": 0.84598291, "epoch": 0.96794872, "global_step/max_steps": "151/156", "percentage": "96.79%", "elapsed_time": "3h 22m 52s", "remaining_time": "6m 43s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012405}
152
+ {"loss": 0.53818375, "grad_norm": 4.28618818, "learning_rate": 0.0, "token_acc": 0.83853394, "epoch": 0.97435897, "global_step/max_steps": "152/156", "percentage": "97.44%", "elapsed_time": "3h 24m 8s", "remaining_time": "5m 22s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01241}
153
+ {"loss": 0.51649928, "grad_norm": 4.39523061, "learning_rate": 0.0, "token_acc": 0.8487395, "epoch": 0.98076923, "global_step/max_steps": "153/156", "percentage": "98.08%", "elapsed_time": "3h 25m 33s", "remaining_time": "4m 1s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012406}
154
+ {"loss": 0.54061735, "grad_norm": 4.38216797, "learning_rate": 0.0, "token_acc": 0.83955288, "epoch": 0.98717949, "global_step/max_steps": "154/156", "percentage": "98.72%", "elapsed_time": "3h 26m 46s", "remaining_time": "2m 41s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012413}
155
+ {"loss": 0.50758892, "grad_norm": 4.21818431, "learning_rate": 0.0, "token_acc": 0.84441513, "epoch": 0.99358974, "global_step/max_steps": "155/156", "percentage": "99.36%", "elapsed_time": "3h 28m 22s", "remaining_time": "1m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012398}
156
+ {"loss": 0.54625416, "grad_norm": 4.95829896, "learning_rate": 0.0, "token_acc": 0.83869255, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 29m 15s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012425}
157
+ {"eval_loss": 0.55147469, "eval_runtime": 10.0275, "eval_samples_per_second": 0.898, "eval_steps_per_second": 0.199, "eval_token_acc": 0.84815109, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 29m 25s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012415}
158
+ {"eval_loss": 0.55147469, "eval_runtime": 12.1526, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.165, "eval_token_acc": 0.84815109, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 33m 23s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012184}
159
+ {"train_runtime": 13045.8923, "train_samples_per_second": 0.763, "train_steps_per_second": 0.012, "total_flos": 69981770547200.0, "train_loss": 0.63741068, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 37m 24s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.011959}
160
+ {"train_dataset": "2651.186647±828.544792, min=980.000000, max=7751.000000, size=9960", "val_dataset": "2758.222222±600.102543, min=2032.000000, max=3684.000000, size=9", "model_parameter_info": "Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.", "last_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156", "best_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156", "best_metric": 0.55147469, "global_step": 156, "log_history": [{"loss": 1.3332561254501343, "grad_norm": 204.5527273204477, "learning_rate": 1.25e-07, "token_acc": 0.7094771265983582, "epoch": 0.00641025641025641, "step": 1}, {"loss": 1.2919142246246338, "grad_norm": 472.01805132730743, "learning_rate": 2.5e-07, "token_acc": 0.7154364585876465, "epoch": 0.01282051282051282, "step": 2}, {"loss": 1.3103106021881104, "grad_norm": 295.9011538877219, "learning_rate": 3.75e-07, "token_acc": 0.7126436829566956, "epoch": 0.019230769230769232, "step": 3}, {"loss": 1.356910228729248, "grad_norm": 420.5023387419371, "learning_rate": 5e-07, "token_acc": 0.7017189264297485, "epoch": 0.02564102564102564, "step": 4}, {"loss": 1.3429011106491089, "grad_norm": 532.4883519493468, "learning_rate": 6.249999999999999e-07, "token_acc": 0.7035235166549683, "epoch": 0.03205128205128205, "step": 5}, {"loss": 1.3432822227478027, "grad_norm": 745.1533569281419, "learning_rate": 7.5e-07, "token_acc": 0.704049289226532, "epoch": 0.038461538461538464, "step": 6}, {"loss": 1.2372357845306396, "grad_norm": 316.9597378880501, "learning_rate": 8.75e-07, "token_acc": 0.719425618648529, "epoch": 0.04487179487179487, "step": 7}, {"loss": 1.1821542978286743, "grad_norm": 80.89470277766097, "learning_rate": 1e-06, "token_acc": 0.7098754644393921, "epoch": 0.05128205128205128, "step": 8}, {"loss": 1.19064462184906, "grad_norm": 194.43529650602088, "learning_rate": 9.998873580873846e-07, "token_acc": 0.706679105758667, 
"epoch": 0.057692307692307696, "step": 9}, {"loss": 1.1448092460632324, "grad_norm": 99.3947168866689, "learning_rate": 9.995494831023408e-07, "token_acc": 0.7132793068885803, "epoch": 0.0641025641025641, "step": 10}, {"loss": 1.0852806568145752, "grad_norm": 84.23475207475708, "learning_rate": 9.989865272804063e-07, "token_acc": 0.7265625, "epoch": 0.07051282051282051, "step": 11}, {"loss": 1.032708764076233, "grad_norm": 85.35629370266734, "learning_rate": 9.98198744271263e-07, "token_acc": 0.7359824776649475, "epoch": 0.07692307692307693, "step": 12}, {"loss": 1.0187692642211914, "grad_norm": 29.61142573444334, "learning_rate": 9.971864890244513e-07, "token_acc": 0.7330501675605774, "epoch": 0.08333333333333333, "step": 13}, {"loss": 1.0328643321990967, "grad_norm": 53.80220392943845, "learning_rate": 9.959502176294382e-07, "token_acc": 0.733631432056427, "epoch": 0.08974358974358974, "step": 14}, {"loss": 0.9353116750717163, "grad_norm": 26.63791401902767, "learning_rate": 9.944904871101226e-07, "token_acc": 0.7549552917480469, "epoch": 0.09615384615384616, "step": 15}, {"loss": 0.8937969207763672, "grad_norm": 241.8763036480847, "learning_rate": 9.928079551738541e-07, "token_acc": 0.7634074687957764, "epoch": 0.10256410256410256, "step": 16}, {"loss": 0.9060311317443848, "grad_norm": 260.0451355494914, "learning_rate": 9.909033799150946e-07, "token_acc": 0.7678724527359009, "epoch": 0.10897435897435898, "step": 17}, {"loss": 0.8835141658782959, "grad_norm": 13.610113077700959, "learning_rate": 9.887776194738431e-07, "token_acc": 0.7689903378486633, "epoch": 0.11538461538461539, "step": 18}, {"loss": 0.8617175817489624, "grad_norm": 13.22296824616019, "learning_rate": 9.864316316489872e-07, "token_acc": 0.7724306583404541, "epoch": 0.12179487179487179, "step": 19}, {"loss": 0.834773063659668, "grad_norm": 10.539571124679378, "learning_rate": 9.838664734667495e-07, "token_acc": 0.7823472023010254, "epoch": 0.1282051282051282, "step": 20}, {"loss": 
0.8732649087905884, "grad_norm": 10.157244927886453, "learning_rate": 9.810833007044246e-07, "token_acc": 0.7687795162200928, "epoch": 0.1346153846153846, "step": 21}, {"loss": 0.8539620041847229, "grad_norm": 8.895664285984349, "learning_rate": 9.780833673696254e-07, "token_acc": 0.7716691493988037, "epoch": 0.14102564102564102, "step": 22}, {"loss": 0.8183756470680237, "grad_norm": 8.046583911365952, "learning_rate": 9.748680251352658e-07, "token_acc": 0.777586817741394, "epoch": 0.14743589743589744, "step": 23}, {"loss": 0.7400453090667725, "grad_norm": 6.409869446484142, "learning_rate": 9.71438722730542e-07, "token_acc": 0.7995849847793579, "epoch": 0.15384615384615385, "step": 24}, {"loss": 0.7597711682319641, "grad_norm": 6.146343328005812, "learning_rate": 9.67797005288181e-07, "token_acc": 0.7876543402671814, "epoch": 0.16025641025641027, "step": 25}, {"loss": 0.7657437324523926, "grad_norm": 6.460835549120558, "learning_rate": 9.639445136482546e-07, "token_acc": 0.7866899967193604, "epoch": 0.16666666666666666, "step": 26}, {"loss": 0.7088508009910583, "grad_norm": 5.989613299220979, "learning_rate": 9.598829836188693e-07, "token_acc": 0.8035452961921692, "epoch": 0.17307692307692307, "step": 27}, {"loss": 0.6949191093444824, "grad_norm": 6.862877869409243, "learning_rate": 9.556142451940679e-07, "token_acc": 0.8057082295417786, "epoch": 0.1794871794871795, "step": 28}, {"loss": 0.6835302114486694, "grad_norm": 5.454999319627048, "learning_rate": 9.511402217292925e-07, "token_acc": 0.8084285259246826, "epoch": 0.1858974358974359, "step": 29}, {"loss": 0.6599457263946533, "grad_norm": 5.4625192461823975, "learning_rate": 9.464629290747842e-07, "token_acc": 0.8131645321846008, "epoch": 0.19230769230769232, "step": 30}, {"loss": 0.6833065748214722, "grad_norm": 5.908943864151192, "learning_rate": 9.415844746673046e-07, "token_acc": 0.8138014078140259, "epoch": 0.1987179487179487, "step": 31}, {"loss": 0.6861380338668823, "grad_norm": 5.288567441191335, 
"learning_rate": 9.36507056580594e-07, "token_acc": 0.8055645823478699, "epoch": 0.20512820512820512, "step": 32}, {"loss": 0.633919894695282, "grad_norm": 5.357065287460616, "learning_rate": 9.312329625349901e-07, "token_acc": 0.818831741809845, "epoch": 0.21153846153846154, "step": 33}, {"loss": 0.6426697969436646, "grad_norm": 5.160602546431693, "learning_rate": 9.257645688666555e-07, "token_acc": 0.817148745059967, "epoch": 0.21794871794871795, "step": 34}, {"loss": 0.6970824003219604, "grad_norm": 4.864703496205445, "learning_rate": 9.201043394568771e-07, "token_acc": 0.8054612874984741, "epoch": 0.22435897435897437, "step": 35}, {"loss": 0.6301164627075195, "grad_norm": 5.209842274657687, "learning_rate": 9.14254824621921e-07, "token_acc": 0.8198198080062866, "epoch": 0.23076923076923078, "step": 36}, {"loss": 0.6474230885505676, "grad_norm": 5.060855111856781, "learning_rate": 9.082186599639427e-07, "token_acc": 0.8159399032592773, "epoch": 0.23717948717948717, "step": 37}, {"loss": 0.6971738338470459, "grad_norm": 4.618117753589896, "learning_rate": 9.019985651834703e-07, "token_acc": 0.805202841758728, "epoch": 0.24358974358974358, "step": 38}, {"loss": 0.6257902383804321, "grad_norm": 4.667129721936595, "learning_rate": 8.955973428539942e-07, "token_acc": 0.8184399604797363, "epoch": 0.25, "step": 39}, {"loss": 0.6510574221611023, "grad_norm": 4.758674560079032, "learning_rate": 8.890178771592197e-07, "token_acc": 0.8164493441581726, "epoch": 0.2564102564102564, "step": 40}, {"loss": 0.636375367641449, "grad_norm": 4.886139426025786, "learning_rate": 8.822631325935463e-07, "token_acc": 0.8183852434158325, "epoch": 0.26282051282051283, "step": 41}, {"loss": 0.6564359664916992, "grad_norm": 4.736579679060144, "learning_rate": 8.753361526263621e-07, "token_acc": 0.8196941614151001, "epoch": 0.2692307692307692, "step": 42}, {"loss": 0.638996958732605, "grad_norm": 4.5575813316907805, "learning_rate": 8.68240058330756e-07, "token_acc": 0.8199967741966248, 
"epoch": 0.27564102564102566, "step": 43}, {"loss": 0.6362872123718262, "grad_norm": 4.994153842899941, "learning_rate": 8.609780469772621e-07, "token_acc": 0.8194926977157593, "epoch": 0.28205128205128205, "step": 44}, {"loss": 0.6394486427307129, "grad_norm": 4.549016612724467, "learning_rate": 8.535533905932737e-07, "token_acc": 0.8174207210540771, "epoch": 0.28846153846153844, "step": 45}, {"loss": 0.6170175075531006, "grad_norm": 4.769737749020403, "learning_rate": 8.459694344887731e-07, "token_acc": 0.8237839937210083, "epoch": 0.2948717948717949, "step": 46}, {"loss": 0.5933050513267517, "grad_norm": 4.680957748858921, "learning_rate": 8.382295957490435e-07, "token_acc": 0.829018235206604, "epoch": 0.30128205128205127, "step": 47}, {"loss": 0.6095594167709351, "grad_norm": 4.52158152824765, "learning_rate": 8.303373616950406e-07, "token_acc": 0.8250677585601807, "epoch": 0.3076923076923077, "step": 48}, {"loss": 0.586349368095398, "grad_norm": 4.8131319269060135, "learning_rate": 8.222962883121195e-07, "token_acc": 0.8342939615249634, "epoch": 0.3141025641025641, "step": 49}, {"loss": 0.6297792196273804, "grad_norm": 4.874040165169469, "learning_rate": 8.141099986478212e-07, "token_acc": 0.8187986016273499, "epoch": 0.32051282051282054, "step": 50}, {"loss": 0.5949305295944214, "grad_norm": 4.942277020634408, "learning_rate": 8.057821811794457e-07, "token_acc": 0.8311688303947449, "epoch": 0.3269230769230769, "step": 51}, {"loss": 0.5739504098892212, "grad_norm": 4.3953959699546505, "learning_rate": 7.973165881521433e-07, "token_acc": 0.8309744596481323, "epoch": 0.3333333333333333, "step": 52}, {"loss": 0.6172730326652527, "grad_norm": 4.735913113697843, "learning_rate": 7.88717033888274e-07, "token_acc": 0.821295976638794, "epoch": 0.33974358974358976, "step": 53}, {"loss": 0.5573772192001343, "grad_norm": 4.378256610554005, "learning_rate": 7.799873930687977e-07, "token_acc": 0.8451364636421204, "epoch": 0.34615384615384615, "step": 54}, {"loss": 
0.5717079639434814, "grad_norm": 4.684218984763407, "learning_rate": 7.711315989874676e-07, "token_acc": 0.838281512260437, "epoch": 0.3525641025641026, "step": 55}, {"loss": 0.59979248046875, "grad_norm": 4.4280118932792325, "learning_rate": 7.621536417786158e-07, "token_acc": 0.8215728998184204, "epoch": 0.358974358974359, "step": 56}, {"loss": 0.554166316986084, "grad_norm": 4.826729862989936, "learning_rate": 7.530575666193282e-07, "token_acc": 0.8405694961547852, "epoch": 0.36538461538461536, "step": 57}, {"loss": 0.5708739161491394, "grad_norm": 4.359263862107659, "learning_rate": 7.438474719068173e-07, "token_acc": 0.834990382194519, "epoch": 0.3717948717948718, "step": 58}, {"loss": 0.5425753593444824, "grad_norm": 4.828866436495037, "learning_rate": 7.345275074118185e-07, "token_acc": 0.8373525142669678, "epoch": 0.3782051282051282, "step": 59}, {"loss": 0.5623891353607178, "grad_norm": 4.598613239569117, "learning_rate": 7.251018724088366e-07, "token_acc": 0.8328151702880859, "epoch": 0.38461538461538464, "step": 60}, {"loss": 0.5742952227592468, "grad_norm": 4.840462028626115, "learning_rate": 7.155748137840892e-07, "token_acc": 0.8290673494338989, "epoch": 0.391025641025641, "step": 61}, {"loss": 0.586388885974884, "grad_norm": 4.23328233570671, "learning_rate": 7.059506241219964e-07, "token_acc": 0.8265019059181213, "epoch": 0.3974358974358974, "step": 62}, {"loss": 0.5368714332580566, "grad_norm": 4.393885373170824, "learning_rate": 6.962336397710819e-07, "token_acc": 0.8431768417358398, "epoch": 0.40384615384615385, "step": 63}, {"loss": 0.6058262586593628, "grad_norm": 4.662530184420074, "learning_rate": 6.864282388901543e-07, "token_acc": 0.8214831352233887, "epoch": 0.41025641025641024, "step": 64}, {"loss": 0.5562135577201843, "grad_norm": 4.558811999067238, "learning_rate": 6.765388394756504e-07, "token_acc": 0.8388559222221375, "epoch": 0.4166666666666667, "step": 65}, {"loss": 0.5610250234603882, "grad_norm": 4.765395712458729, 
"learning_rate": 6.665698973710288e-07, "token_acc": 0.8305501937866211, "epoch": 0.4230769230769231, "step": 66}, {"loss": 0.5418926477432251, "grad_norm": 4.5677446123493635, "learning_rate": 6.565259042591111e-07, "token_acc": 0.8431202173233032, "epoch": 0.42948717948717946, "step": 67}, {"loss": 0.5533748865127563, "grad_norm": 4.529946815527597, "learning_rate": 6.464113856382751e-07, "token_acc": 0.8329079747200012, "epoch": 0.4358974358974359, "step": 68}, {"loss": 0.5477522015571594, "grad_norm": 4.457221273065978, "learning_rate": 6.362308987834115e-07, "token_acc": 0.8378193974494934, "epoch": 0.4423076923076923, "step": 69}, {"loss": 0.5534911751747131, "grad_norm": 4.611531783389826, "learning_rate": 6.259890306925626e-07, "token_acc": 0.841196596622467, "epoch": 0.44871794871794873, "step": 70}, {"loss": 0.5345719456672668, "grad_norm": 4.418661988438471, "learning_rate": 6.156903960201708e-07, "token_acc": 0.8396868705749512, "epoch": 0.4551282051282051, "step": 71}, {"loss": 0.573357343673706, "grad_norm": 4.438614534821985, "learning_rate": 6.053396349978631e-07, "token_acc": 0.8328611850738525, "epoch": 0.46153846153846156, "step": 72}, {"loss": 0.548120379447937, "grad_norm": 4.304860173689802, "learning_rate": 5.949414113437141e-07, "token_acc": 0.8377840518951416, "epoch": 0.46794871794871795, "step": 73}, {"loss": 0.554309606552124, "grad_norm": 4.194319397177033, "learning_rate": 5.845004101609246e-07, "token_acc": 0.8362563848495483, "epoch": 0.47435897435897434, "step": 74}, {"loss": 0.5796681642532349, "grad_norm": 4.6298008452210615, "learning_rate": 5.740213358268658e-07, "token_acc": 0.8328510522842407, "epoch": 0.4807692307692308, "step": 75}, {"loss": 0.5438037514686584, "grad_norm": 4.355472281518206, "learning_rate": 5.635089098734393e-07, "token_acc": 0.8434426188468933, "epoch": 0.48717948717948717, "step": 76}, {"loss": 0.5334382653236389, "grad_norm": 4.627740617106088, "learning_rate": 5.52967868859708e-07, "token_acc": 
0.8434890508651733, "epoch": 0.4935897435897436, "step": 77}, {"loss": 0.5313788652420044, "grad_norm": 4.2780408616986945, "learning_rate": 5.424029622377546e-07, "token_acc": 0.8449059128761292, "epoch": 0.5, "step": 78}, {"loss": 0.5232336521148682, "grad_norm": 4.092855208873091, "learning_rate": 5.318189502127331e-07, "token_acc": 0.8468860983848572, "epoch": 0.5064102564102564, "step": 79}, {"loss": 0.5497955083847046, "grad_norm": 4.428993590455852, "learning_rate": 5.212206015980741e-07, "token_acc": 0.8446012735366821, "epoch": 0.5128205128205128, "step": 80}, {"loss": 0.546926736831665, "grad_norm": 4.21577617143911, "learning_rate": 5.106126916668118e-07, "token_acc": 0.8366925716400146, "epoch": 0.5192307692307693, "step": 81}, {"loss": 0.5270861387252808, "grad_norm": 4.438862747258348, "learning_rate": 5e-07, "token_acc": 0.8371419310569763, "epoch": 0.5256410256410257, "step": 82}, {"loss": 0.5339565277099609, "grad_norm": 4.176826355296084, "learning_rate": 4.893873083331882e-07, "token_acc": 0.8417102098464966, "epoch": 0.532051282051282, "step": 83}, {"loss": 0.5424758195877075, "grad_norm": 4.530198529065255, "learning_rate": 4.787793984019259e-07, "token_acc": 0.8395857810974121, "epoch": 0.5384615384615384, "step": 84}, {"loss": 0.5200133919715881, "grad_norm": 4.370509876032819, "learning_rate": 4.681810497872668e-07, "token_acc": 0.8433917760848999, "epoch": 0.5448717948717948, "step": 85}, {"loss": 0.5480057597160339, "grad_norm": 4.382905385719549, "learning_rate": 4.5759703776224555e-07, "token_acc": 0.8397650718688965, "epoch": 0.5512820512820513, "step": 86}, {"loss": 0.5228407382965088, "grad_norm": 4.342442518129754, "learning_rate": 4.47032131140292e-07, "token_acc": 0.8539518713951111, "epoch": 0.5576923076923077, "step": 87}, {"loss": 0.5413128137588501, "grad_norm": 4.137341022557355, "learning_rate": 4.364910901265606e-07, "token_acc": 0.8368589878082275, "epoch": 0.5641025641025641, "step": 88}, {"loss": 0.5160300731658936, 
"grad_norm": 4.369749651287342, "learning_rate": 4.259786641731343e-07, "token_acc": 0.8478597402572632, "epoch": 0.5705128205128205, "step": 89}, {"loss": 0.5385915637016296, "grad_norm": 4.5684142339974585, "learning_rate": 4.154995898390755e-07, "token_acc": 0.8457493185997009, "epoch": 0.5769230769230769, "step": 90}, {"loss": 0.5259469747543335, "grad_norm": 4.263077787933313, "learning_rate": 4.0505858865628575e-07, "token_acc": 0.8433637022972107, "epoch": 0.5833333333333334, "step": 91}, {"loss": 0.5241655111312866, "grad_norm": 4.246658460460572, "learning_rate": 3.94660365002137e-07, "token_acc": 0.8478260636329651, "epoch": 0.5897435897435898, "step": 92}, {"loss": 0.5464353561401367, "grad_norm": 4.4264562426283325, "learning_rate": 3.8430960397982926e-07, "token_acc": 0.8348221778869629, "epoch": 0.5961538461538461, "step": 93}, {"loss": 0.5346203446388245, "grad_norm": 4.571352573379081, "learning_rate": 3.7401096930743746e-07, "token_acc": 0.8426615595817566, "epoch": 0.6025641025641025, "step": 94}, {"loss": 0.4898928999900818, "grad_norm": 4.302883692806041, "learning_rate": 3.637691012165886e-07, "token_acc": 0.8545727133750916, "epoch": 0.6089743589743589, "step": 95}, {"loss": 0.5597435235977173, "grad_norm": 4.488514668810176, "learning_rate": 3.535886143617248e-07, "token_acc": 0.8423628211021423, "epoch": 0.6153846153846154, "step": 96}, {"loss": 0.5908330082893372, "grad_norm": 4.5047373126179036, "learning_rate": 3.4347409574088894e-07, "token_acc": 0.831405758857727, "epoch": 0.6217948717948718, "step": 97}, {"loss": 0.5422452092170715, "grad_norm": 4.326989989523886, "learning_rate": 3.334301026289712e-07, "token_acc": 0.8455585837364197, "epoch": 0.6282051282051282, "step": 98}, {"loss": 0.4947037994861603, "grad_norm": 4.071817598010613, "learning_rate": 3.234611605243496e-07, "token_acc": 0.8524978756904602, "epoch": 0.6346153846153846, "step": 99}, {"loss": 0.4820348620414734, "grad_norm": 4.364292580374463, "learning_rate": 
3.135717611098457e-07, "token_acc": 0.8573145866394043, "epoch": 0.6410256410256411, "step": 100}, {"loss": 0.577752411365509, "grad_norm": 4.245101360198055, "learning_rate": 3.037663602289181e-07, "token_acc": 0.8319244384765625, "epoch": 0.6474358974358975, "step": 101}, {"loss": 0.5117242336273193, "grad_norm": 4.3455778338843, "learning_rate": 2.940493758780037e-07, "token_acc": 0.8504513502120972, "epoch": 0.6538461538461539, "step": 102}, {"loss": 0.5905250310897827, "grad_norm": 4.354835545935755, "learning_rate": 2.8442518621591084e-07, "token_acc": 0.829204261302948, "epoch": 0.6602564102564102, "step": 103}, {"loss": 0.5381240248680115, "grad_norm": 4.373092445004486, "learning_rate": 2.748981275911633e-07, "token_acc": 0.83758544921875, "epoch": 0.6666666666666666, "step": 104}, {"loss": 0.5173760056495667, "grad_norm": 4.158187745451947, "learning_rate": 2.6547249258818163e-07, "token_acc": 0.8419111371040344, "epoch": 0.6730769230769231, "step": 105}, {"loss": 0.5291812419891357, "grad_norm": 4.360270954918175, "learning_rate": 2.561525280931828e-07, "token_acc": 0.8481848239898682, "epoch": 0.6794871794871795, "step": 106}, {"loss": 0.5495140552520752, "grad_norm": 4.394654818317897, "learning_rate": 2.469424333806718e-07, "token_acc": 0.8393661379814148, "epoch": 0.6858974358974359, "step": 107}, {"loss": 0.528519868850708, "grad_norm": 4.4823775203281, "learning_rate": 2.378463582213842e-07, "token_acc": 0.8453878164291382, "epoch": 0.6923076923076923, "step": 108}, {"loss": 0.4981050491333008, "grad_norm": 4.219004456899406, "learning_rate": 2.2886840101253247e-07, "token_acc": 0.8480120897293091, "epoch": 0.6987179487179487, "step": 109}, {"loss": 0.5449967980384827, "grad_norm": 4.529812908764033, "learning_rate": 2.2001260693120232e-07, "token_acc": 0.838426411151886, "epoch": 0.7051282051282052, "step": 110}, {"loss": 0.5300467610359192, "grad_norm": 4.264681112622175, "learning_rate": 2.112829661117259e-07, "token_acc": 0.8408368229866028, 
"epoch": 0.7115384615384616, "step": 111}, {"loss": 0.5546152591705322, "grad_norm": 4.224432096547358, "learning_rate": 2.026834118478567e-07, "token_acc": 0.8335220813751221, "epoch": 0.717948717948718, "step": 112}, {"loss": 0.5911360383033752, "grad_norm": 4.40275179389628, "learning_rate": 1.9421781882055443e-07, "token_acc": 0.8269292712211609, "epoch": 0.7243589743589743, "step": 113}, {"loss": 0.5440055131912231, "grad_norm": 4.438604539947117, "learning_rate": 1.858900013521788e-07, "token_acc": 0.8439837098121643, "epoch": 0.7307692307692307, "step": 114}, {"loss": 0.5248892307281494, "grad_norm": 4.3373601365266605, "learning_rate": 1.777037116878804e-07, "token_acc": 0.8428595662117004, "epoch": 0.7371794871794872, "step": 115}, {"loss": 0.5044262409210205, "grad_norm": 3.9787786664284495, "learning_rate": 1.6966263830495935e-07, "token_acc": 0.8520953059196472, "epoch": 0.7435897435897436, "step": 116}, {"loss": 0.5238938331604004, "grad_norm": 4.206466011537137, "learning_rate": 1.6177040425095663e-07, "token_acc": 0.8503661751747131, "epoch": 0.75, "step": 117}, {"loss": 0.4803212583065033, "grad_norm": 4.11543543662316, "learning_rate": 1.5403056551122694e-07, "token_acc": 0.861027181148529, "epoch": 0.7564102564102564, "step": 118}, {"loss": 0.5092470645904541, "grad_norm": 4.307365738144286, "learning_rate": 1.4644660940672627e-07, "token_acc": 0.847130537033081, "epoch": 0.7628205128205128, "step": 119}, {"loss": 0.5285454988479614, "grad_norm": 4.228969596612662, "learning_rate": 1.3902195302273778e-07, "token_acc": 0.8402243852615356, "epoch": 0.7692307692307693, "step": 120}, {"loss": 0.5160696506500244, "grad_norm": 4.317437082837098, "learning_rate": 1.3175994166924392e-07, "token_acc": 0.843143880367279, "epoch": 0.7756410256410257, "step": 121}, {"loss": 0.5897269248962402, "grad_norm": 4.206230433373526, "learning_rate": 1.2466384737363779e-07, "token_acc": 0.8222737908363342, "epoch": 0.782051282051282, "step": 122}, {"loss": 
0.5243645906448364, "grad_norm": 4.537270673759693, "learning_rate": 1.1773686740645383e-07, "token_acc": 0.8434827327728271, "epoch": 0.7884615384615384, "step": 123}, {"loss": 0.5309593677520752, "grad_norm": 4.338972097867842, "learning_rate": 1.1098212284078035e-07, "token_acc": 0.8410027623176575, "epoch": 0.7948717948717948, "step": 124}, {"loss": 0.5152863264083862, "grad_norm": 4.172268942442711, "learning_rate": 1.0440265714600571e-07, "token_acc": 0.8451645970344543, "epoch": 0.8012820512820513, "step": 125}, {"loss": 0.5396726131439209, "grad_norm": 4.250149736675509, "learning_rate": 9.800143481652979e-08, "token_acc": 0.8436694741249084, "epoch": 0.8076923076923077, "step": 126}, {"loss": 0.5297789573669434, "grad_norm": 4.405607345694319, "learning_rate": 9.17813400360572e-08, "token_acc": 0.8374651074409485, "epoch": 0.8141025641025641, "step": 127}, {"loss": 0.4773644208908081, "grad_norm": 4.256107645791666, "learning_rate": 8.574517537807896e-08, "token_acc": 0.8574733138084412, "epoch": 0.8205128205128205, "step": 128}, {"loss": 0.5019025802612305, "grad_norm": 4.402345485095196, "learning_rate": 7.989566054312286e-08, "token_acc": 0.8487906455993652, "epoch": 0.8269230769230769, "step": 129}, {"loss": 0.509482741355896, "grad_norm": 4.222991640512232, "learning_rate": 7.423543113334435e-08, "token_acc": 0.8477755784988403, "epoch": 0.8333333333333334, "step": 130}, {"loss": 0.5533698201179504, "grad_norm": 4.331972240710486, "learning_rate": 6.876703746500984e-08, "token_acc": 0.8373857736587524, "epoch": 0.8397435897435898, "step": 131}, {"loss": 0.5067212581634521, "grad_norm": 4.085679298862501, "learning_rate": 6.349294341940592e-08, "token_acc": 0.8518636226654053, "epoch": 0.8461538461538461, "step": 132}, {"loss": 0.49411821365356445, "grad_norm": 4.273453737642771, "learning_rate": 5.8415525332695334e-08, "token_acc": 0.8478958606719971, "epoch": 0.8525641025641025, "step": 133}, {"loss": 0.5529794692993164, "grad_norm": 
4.262340473425649, "learning_rate": 5.353707092521581e-08, "token_acc": 0.8371466994285583, "epoch": 0.8589743589743589, "step": 134}, {"loss": 0.509425699710846, "grad_norm": 4.382139460926817, "learning_rate": 4.885977827070747e-08, "token_acc": 0.8512613773345947, "epoch": 0.8653846153846154, "step": 135}, {"loss": 0.5211268067359924, "grad_norm": 4.62171216340832, "learning_rate": 4.4385754805932095e-08, "token_acc": 0.84375, "epoch": 0.8717948717948718, "step": 136}, {"loss": 0.49088388681411743, "grad_norm": 4.204088741197678, "learning_rate": 4.0117016381130634e-08, "token_acc": 0.8564977645874023, "epoch": 0.8782051282051282, "step": 137}, {"loss": 0.5041382312774658, "grad_norm": 4.377681174151091, "learning_rate": 3.6055486351745324e-08, "token_acc": 0.8541952967643738, "epoch": 0.8846153846153846, "step": 138}, {"loss": 0.5480432510375977, "grad_norm": 4.32917767077044, "learning_rate": 3.220299471181898e-08, "token_acc": 0.8370358347892761, "epoch": 0.8910256410256411, "step": 139}, {"loss": 0.5408469438552856, "grad_norm": 4.190278678307778, "learning_rate": 2.8561277269457895e-08, "token_acc": 0.8408304452896118, "epoch": 0.8974358974358975, "step": 140}, {"loss": 0.5250701904296875, "grad_norm": 4.429799923194152, "learning_rate": 2.513197486473406e-08, "token_acc": 0.8404897451400757, "epoch": 0.9038461538461539, "step": 141}, {"loss": 0.49129876494407654, "grad_norm": 4.240556447333352, "learning_rate": 2.1916632630374577e-08, "token_acc": 0.8528439402580261, "epoch": 0.9102564102564102, "step": 142}, {"loss": 0.5107077360153198, "grad_norm": 4.094432099971454, "learning_rate": 1.8916699295575323e-08, "token_acc": 0.8480400443077087, "epoch": 0.9166666666666666, "step": 143}, {"loss": 0.48216086626052856, "grad_norm": 4.096105481468722, "learning_rate": 1.6133526533250563e-08, "token_acc": 0.8517335057258606, "epoch": 0.9230769230769231, "step": 144}, {"loss": 0.4809270203113556, "grad_norm": 3.999889432088483, "learning_rate": 
1.3568368351012716e-08, "token_acc": 0.8589807748794556, "epoch": 0.9294871794871795, "step": 145}, {"loss": 0.4755428731441498, "grad_norm": 4.312742288765351, "learning_rate": 1.1222380526156927e-08, "token_acc": 0.8620191216468811, "epoch": 0.9358974358974359, "step": 146}, {"loss": 0.5387917757034302, "grad_norm": 4.535728213741053, "learning_rate": 9.096620084905471e-09, "token_acc": 0.8353335857391357, "epoch": 0.9423076923076923, "step": 147}, {"loss": 0.4899570345878601, "grad_norm": 4.1679468407507905, "learning_rate": 7.1920448261457715e-09, "token_acc": 0.8593058586120605, "epoch": 0.9487179487179487, "step": 148}, {"loss": 0.5454012155532837, "grad_norm": 4.2351778138083604, "learning_rate": 5.509512889877332e-09, "token_acc": 0.8417900800704956, "epoch": 0.9551282051282052, "step": 149}, {"loss": 0.523182213306427, "grad_norm": 4.296677102692333, "learning_rate": 4.049782370561583e-09, "token_acc": 0.8438036441802979, "epoch": 0.9615384615384616, "step": 150}, {"loss": 0.5141420364379883, "grad_norm": 4.323468547344599, "learning_rate": 2.813510975548772e-09, "token_acc": 0.8459829092025757, "epoch": 0.967948717948718, "step": 151}, {"loss": 0.5381837487220764, "grad_norm": 4.286188181441807, "learning_rate": 1.8012557287367391e-09, "token_acc": 0.8385339379310608, "epoch": 0.9743589743589743, "step": 152}, {"loss": 0.5164992809295654, "grad_norm": 4.395230614166292, "learning_rate": 1.013472719593733e-09, "token_acc": 0.848739504814148, "epoch": 0.9807692307692307, "step": 153}, {"loss": 0.5406173467636108, "grad_norm": 4.382167968792246, "learning_rate": 4.5051689765929213e-10, "token_acc": 0.8395528793334961, "epoch": 0.9871794871794872, "step": 154}, {"loss": 0.5075889229774475, "grad_norm": 4.218184305013247, "learning_rate": 1.1264191261528555e-10, "token_acc": 0.8444151282310486, "epoch": 0.9935897435897436, "step": 155}, {"loss": 0.5462541580200195, "grad_norm": 4.958298958687034, "learning_rate": 0.0, "token_acc": 0.8386925458908081, "epoch": 
1.0, "step": 156}, {"eval_loss": 0.5514746904373169, "eval_runtime": 10.0275, "eval_samples_per_second": 0.898, "eval_steps_per_second": 0.199, "eval_token_acc": 0.8481510877609253, "epoch": 1.0, "step": 156}, {"eval_loss": 0.5514746904373169, "eval_runtime": 12.1526, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.165, "eval_token_acc": 0.8481510877609253, "epoch": 1.0, "step": 156}, {"train_runtime": 13045.8923, "train_samples_per_second": 0.763, "train_steps_per_second": 0.012, "total_flos": 69981770547200.0, "train_loss": 0.6374106772052937, "epoch": 1.0, "step": 156}], "memory": 92.6015625}
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/runs/events.out.tfevents.1755695667.TENCENT64.site.2602616.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f775150e89c38ae07379b75d60c0df41cb1151e270a0dbe6db976e3d848e7eca
3
+ size 49552
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/val_dataset.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which of the following is known for the song Sister Havana: Urge Overkill or Ministry?\nObservation: [1947] RootWebArea 'Search: Sister Havana Urge Overkill' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Sister+Havana+Urge+Overkill\n\t[1968] textbox \"Search 'Wikipedia'\" required: False\n\t[1974] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1975] button '🏠'\n\t[1976] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1977] button 'Wikipedia'\n\t[1978] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1979] button '🎲'\n\t[1839] StaticText 'Results 1-20 of 20 for \"Sister Havana Urge Overkill\"'\n\t[1987] link 'Saturation (Urge Overkill album)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Saturation_(Urge_Overkill_album)\n\t[1846] StaticText '...\"Sister Havana\" and \"Positive Bleeding\" as singles in the US and Europe. \"Sister Havana\" charted highly on both the modern rock and mainstream rock charts, peaking at numbers 6 and 10, respectively, while \"Positive Bleeding\" became a minor rock radio hit. Saturation Studio album by Urge Overkill Released June 8, 1993 Recorded December 17, 1992–January 9, 1993 Genre Alternative rock Length 70:01 Label Geffen Producer Butcher Bros., Andy Kravitz Urge Overkill chronology Stull (1992) Saturation......'\n\t[1855] StaticText 'from Wikipedia'\n\t[1856] StaticText '635 words'\n\t[1996] link 'Urge Overkill' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Urge_Overkill\n\t[1858] StaticText '...\"Sister Havana\" and their cover of Neil Diamond\\'s \"Girl, You\\'ll Be a Woman Soon\", which was notably used in Quentin Tarantino\\'s Pulp Fiction. Their latest album, Oui, was released in 2022. 
Urge Overkill Background information Origin Chicago, Illinois, United States Genres Alternative rock, hard rock Years active 1986–1997, 2004–present Labels Touch and Go, Geffen Website www.urgeoverkill.com Members Nash Kato Eddie \"King\" Roeser Nate Arling Adam Arling Past members Johnny \"Blackie Onassis\"......'\n\t[1863] StaticText 'from Wikipedia'\n\t[1864] StaticText '1,430 words'\n\t[2003] link 'Triple J Hottest 100, 1993' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Triple_J_Hottest_100,_1993\n\t[1872] StaticText '1,786 words'\n\t[2010] link 'Hit Machine' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Hit_Machine\n\t[1874] StaticText '...\"Sister Havana\" (3:43) Peter Andre – \"Let\\'s Get It On\" (4:09) Leftfield featuring John Lydon – \"Open Up\" (3:46) Xscape – \"Just Kickin\\' It\" (3:25) Ace of Base – \"Happy Nation\" (3:28) Defryme – \"Pure Killer\" (4:04) Funky Poets – \"Born in the Ghetto\" (4:18) Certifications Region Certification Certified units/sales Australia (ARIA)[7] Platinum 70,000^ ^ Shipments figures based on certification alone. Hit Machine 5 Hit Machine 5 Compilation album by Various artists Released 1994 Genre......'\n\t[1879] StaticText 'from Wikipedia'\n\t[1880] StaticText '5,688 words'\n\t[2017] link 'Reading and Leeds Festivals line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Reading_and_Leeds_Festivals_line-ups\n\t[1886] StaticText '16,832 words'\n\t[2023] link 'Glastonbury Festival line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Glastonbury_Festival_line-ups\nHISTORY_ACTION: type [328] [Sister Havana] [1]\ntype [1335] [Sister Havana Urge Overkill] [1]\n"}, {"role": "assistant", "content": "<think>The search results and snippets indicate that \"Sister Havana\" is a song by Urge Overkill. The text mentions \"Sister Havana\" as a single by Urge Overkill, and there is no mention of Ministry in connection with the song. 
I have enough information to answer the question.</think>\n```stop [Urge Overkill is known for the song \"Sister Havana\".]```"}]}
2
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: what class of food does egg belong to\nObservation: [1481] RootWebArea 'Search: egg' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=egg\n\t[1502] textbox \"Search 'Wikipedia'\" required: False\n\t[1508] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1509] button '🏠'\n\t[1510] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1511] button 'Wikipedia'\n\t[1512] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1513] button '🎲'\n\t[1033] StaticText 'Results 1-25 of 
78,533 for \"egg\"'\n\t[1521] link 'List of egg topics' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_topics\n\t[1040] StaticText '...Egg dishes and List of egg dishes A to Z egg dishes Balut (egg) Bai pong moan Boiled egg Brik Century egg Changua Chinese steamed eggs Çılbır Coddled egg Custard Deviled egg Devonshire eggs Egg bhurji Egg drop soup Egg foo young Egg in the basket Egg khagina Egg salad Egg sandwich Eggs Benedict Eggs Neptune Eggs Sardou Eggs and brains French toast Fried egg Fritaja Frittata Haminados Hangtown fry Huevos divorciados[lower-alpha 1] Huevos motuleños[lower-alpha 1] Huevos rancheros[lower-alpha 1]......'\n\t[1073] StaticText 'from Wikipedia'\n\t[1074] StaticText '907 words'\n\t[1542] link 'Boiled egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Boiled_egg\n\t[1076] StaticText '...egg white and egg yolk both solidify, while soft-boiled eggs may leave the yolk, and sometimes the white, at least partially liquid and raw. Boiled eggs are a popular breakfast food around the world. For eggs cooked without their shells in hot water, see poached egg. For other egg dishes, see list of egg dishes. Boiled egg A soft-boiled egg served in the half shell Main ingredients Eggs (typically chicken) Variations Baked eggs, starting temperature, preparation Food energy (per 100\\xa0g serving)......'\n\t[1091] StaticText 'from Wikipedia'\n\t[1092] StaticText '2,185 words'\n\t[1554] link 'Egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg\n\t[1094] StaticText 'Egg An egg is the organic vessel containing the zygote in which an embryo develops until it can survive on its own, at which point the animal hatches. An egg results from fertilization of an egg cell. Most arthropods, vertebrates (excluding live-bearing mammals), and mollusks lay eggs, although some, such as scorpions, do not. This article is about biological eggs. For eggs as food, see Egg as food. For other uses, see Egg (disambiguation). 
Eggs of various birds, a reptile, various cartilaginous......'\n\t[1106] StaticText 'from Wikipedia'\n\t[1107] StaticText '5,196 words'\n\t[1565] link 'Easter egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Easter_egg\n\t[1109] StaticText '...egg (media). Decorated Easter eggs Easter egg of the Ukrainian variety with the Paschal greeting \"Christ is Risen!\" A chocolate Easter egg Although eggs, in general, were a traditional symbol of fertility and rebirth,[2] in Christianity, for the celebration of Eastertide, Easter eggs symbolize the empty tomb of Jesus, from which Jesus was resurrected.[3][4][5] In addition, one ancient tradition was the staining of Easter eggs with the colour red \"in memory of the blood of Christ, shed as at that......'\n\t[1116] StaticText 'from Wikipedia'\n\t[1117] StaticText '6,153 words'\n\t[1573] link 'Egg as food' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg_as_food\n\t[1119] StaticText '...(egg white), and vitellus (egg yolk), contained within various thin membranes. The most commonly consumed eggs are chicken eggs. Other poultry eggs including those of duck and quail also are eaten. Fish eggs are called roe and caviar. \"Chicken egg\" redirects here. For the causality dilemma, see Chicken or the egg. For Egg (biology), see Egg. For other uses, see Egg (disambiguation). Fried eggs with carrot, Parmesan and cream Egg yolks and whole eggs store significant amounts of protein and......'\n\t[1136] StaticText 'from Wikipedia'\n\t[1137] StaticText '9,695 words'\n\t[1586] link 'List of egg dishes' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_dishes\n\t[1139] StaticText '...(egg yolk), contained within many different thin membranes all surrounded by a protective eggshell. 
Popular choices for egg consumption are chicken, duck, quail, roe, caviar, and emu, but the '\nHISTORY_ACTION: type [330] [egg] [1]\n"}, {"role": "assistant", "content": "<think>The link titled 'Egg as food' is likely to contain information about the class of food to which eggs belong. Clicking this link should provide the relevant details.</think>\n```click [1573] [Egg as food]```"}]}
3
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Lawrence D. 
Cohen's Carrie:The Musical is an adaptation of Stephen King's novel, which centers around a socially awkward teenage girl with what newly discovered powers?\nObservation: [2717] RootWebArea 'Carrie (musical)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)\n\t[2829] textbox \"Search 'Wikipedia'\" required: False\n\t[2835] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[2836] button '🏠'\n\t[2837] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[2838] button 'Wikipedia'\n\t[2839] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[2840] button '🎲'\n\t[2847] heading 'Carrie (musical)' hasPopup: menu\n\t[1789] StaticText 'Carrie is a '\n\t[2719] link 'musical' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Musical_theatre\n\t[1792] StaticText ' with a book by '\n\t[2854] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t[1794] StaticText ', lyrics by '\n\t[2855] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t[1796] StaticText ', and music by '\n\t[2856] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t[1798] StaticText '. Adapted from '\n\t[2857] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t[2859] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1802] StaticText ', it focuses on an awkward teenage girl with '\n\t[2860] link 'telekinetic powers' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Psychokinesis\n\t[1804] StaticText ' whose lonely life is dominated by an oppressive '\n\t[2861] link 'religious fanatic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Religious_fanaticism\n\t[1806] StaticText ' mother. 
When she is humiliated by her classmates at the high school '\n\t[2862] link 'prom' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Prom\n\t[1808] StaticText ', she unleashes chaos on everyone and everything in her path.'\n\t[2864] table ''\n\t\t[2866] row ''\n\t\t\t[2867] columnheader 'Carrie' required: False\n\t\t[2870] row ''\n\t\t\t[2871] cell 'Official poster for the original Stratford-upon-Avon production'\n\t\t\t\t[2768] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Carrie_the_Musical_Poster.jpg.webp\n\t\t[2874] row ''\n\t\t\t[2875] rowheader 'Music' required: False\n\t\t\t[2876] cell 'Michael Gore'\n\t\t\t\t[2877] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[2878] row ''\n\t\t\t[2879] rowheader 'Lyrics' required: False\n\t\t\t[2880] cell 'Dean Pitchford'\n\t\t\t\t[2881] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[2882] row ''\n\t\t\t[2883] rowheader 'Book' required: False\n\t\t\t\t[2884] link 'Book' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Book_(musical_theatre)\n\t\t\t[2885] cell 'Lawrence D. Cohen'\n\t\t\t\t[2886] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[2887] row ''\n\t\t\t[2888] rowheader 'Basis' required: False\n\t\t\t[2889] cell 'Carrie by Stephen King'\n\t\t\t\t[2890] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1830] StaticText 'Originally premiering in the U.K. in 1988, Carrie opened on '\n\t[2899] link 'Broadway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1834] StaticText ' at the Virginia Theatre the same year, but closed after 16 previews and five regular performances. 
Due to the passionate response from both its critics and its fans, the show is considered one of the most notable failures in '\n\t[2900] link 'Broadway theatre' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1836] StaticText ' history; a 1991 book written by '\n\t[2901] link 'Ken Mandelbaum' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ken_Mandelbaum\n\t[1838] StaticText ', which chronicled the history of flop Broadway musicals, was partially entitled Not Since Carrie, and a 2021 podcast, Out for Blood, documented its creation and development.'\n\t[2904] group ''\n\t\t[2909] DisclosureTriangle 'Production history' expanded: True\n\t\t\t[2910] heading 'Production history'\n\t\t[1845] StaticText 'Inspired by a 1981 performance of '\n\t\t[2912] link 'Alban Berg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alban_Berg\n\t\t[2914] link 'Lulu' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lulu_(opera)\n\t\t[1849] StaticText ' at the '\n\t\t[2915] link 'Metropolitan Opera House' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Metropolitan_Opera_House_(Lincoln_Center)\n\t\t[2917] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-DVD_featurette-1\n\t\t[2919] link 'Lawrence D. 
Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[1854] StaticText ', who wrote the script for the 1976 film version of '\n\t\t[2920] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(1976_film)\n\t\t[1856] StaticText ', and '\n\t\t[2922] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[1858] StaticText ' began work on a musical based on the '\n\t\t[2923] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t\t[2925] link 'Fame' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fame_(1980_film)\n\t\t[1862] StaticText ' collaborator, '\n\t\t[2926] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[1864] StaticText ', was brought in to work on the project, which underwent numerous rewrites.'\n\t\t[2928] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-2\n\t\t[1865] StaticText ' In August 1984, a '\n\t\t[2930] link 'workshop' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Workshop\n\t\t[1867] StaticText ' of the first act was staged at 890 Broadway in New York City, with '\n\t\t[2931] link 'Annie Golden' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Annie_Golden\n\t\t[1869] StaticText ' as Carrie, '\n\t\t[2932] link 'Maureen McGovern' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Maureen_McGovern\n\t\t[1871] StaticText ' as Mrs. White, '\n\t\t[2933] link 'Laurie Beechman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Laurie_Beechman\n\t\t[1873] StaticText ' as Mrs. Gardner, and '\n\t\t[2934] link 'Liz Callaway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Liz_Callaway\n\t\t[1875] StaticText ' as Chris. 
It was soon announced that Carrie would be produced on '\nHISTORY_ACTION: type [327] [Carrie (musical)] [1]\nclick [1309] [Carrie (musical)]\n"}, {"role": "assistant", "content": "<think>The objective is to identify what newly discovered powers the main character, Carrie, has in Lawrence D. Cohen's adaptation. The text in the accessibility tree states: \"it focuses on an awkward teenage girl with telekinetic powers whose lonely life is dominated by an oppressive religious fanatic mother.\" This directly answers the question.</think>\n```stop [telekinetic powers]```"}]}
4
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Name the second world war trilogy written by the English writer, Evelyn Waugh, who is also known for his travel writings in the 1930s and a later travel book called 'A Tourist in Africa' which appeared in 1960?\nObservation: [292] RootWebArea 'User:The other Kiwix guy/Landing' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing\n\t[329] textbox \"Search 'Wikipedia'\" required: False\n\t[335] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[336] button '🏠'\n\t[337] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[338] button 
'Wikipedia'\n\t[339] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[340] button '🎲'\n\t[3] StaticText 'Welcome to '\n\t[351] link 'Wikipedia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wikipedia\n\t[5] StaticText 'The free encyclopedia.'\n\t[6] StaticText '6,489,052'\n\t[7] StaticText ' articles in '\n\t[357] link 'English' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/English_Wikipedia\n\t[359] heading 'Arts'\n\t[361] link 'Architecture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Architecture\n\t[362] link 'Books' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Books\n\t[363] link 'Cinematography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cinematography\n\t[364] link 'Dance' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dance\n\t[365] link 'Design' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Design\n\t[366] link 'Fashion' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fashion\n\t[367] link 'Films' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Films\n\t[368] link 'Gastronomy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gastronomy\n\t[369] link 'Literature' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Literature\n\t[370] link 'Magic (illusion)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Magic_(illusion)\n\t[371] link 'Music' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Music\n\t[372] link 'Painting' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Painting\n\t[373] link 'Photography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Photography\n\t[374] link 'Poetry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Poetry\n\t[375] link 'Sculpture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sculpture\n\t[376] link 'Theatre' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Theatre\n\t[378] heading 'Geography'\n\t[380] link 'Africa' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Africa\n\t[381] link 'Antarctica' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Antarctica\n\t[382] link 'Arctic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Arctic\n\t[383] link 'Asia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Asia\n\t[384] link 'Caribbean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Caribbean\n\t[385] link 'Central America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Central_America\n\t[386] link 'Europe' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Europe\n\t[387] link 'Latin America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Latin_America\n\t[388] link 'Mediterranean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mediterranean\n\t[389] link 'Middle East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_East\n\t[390] link 'North America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/North_America\n\t[391] link 'Oceania' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Oceania\n\t[392] link 'South America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/South_America\n\t[393] link 'Cartography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cartography\n\t[395] heading 'History'\n\t[397] link 'Ancient Egypt' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Egypt\n\t[398] link 'Ancient Greece' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Greece\n\t[399] link 'Ancient Near East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Near_East\n\t[400] link 'Ancient Rome' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Rome\n\t[401] link 'Archaeology' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Archaeology\n\t[402] link 'British Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/British_Empire\n\t[294] link 'Byzantine Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Byzantine_Empire\n\t[403] link 'Colonialism' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Colonialism\n\t[404] link 'Crusades' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Crusades\n\t[405] link 'Heraldry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heraldry\n\t[406] link 'History of science' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_science\n\t[407] link 'Imperial China' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_China#Imperial_China\n\t[408] link 'Indian independence movement' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Indian_independence_movement\n\t[409] link 'Japan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_Japan\n\t[410] link 'Middle Ages' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_Ages\n\t[411] link 'Mughal Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mughal_Empire\n\t[412] link 'Ottoman Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ottoman_Empire\n\t[413] link 'Russian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Russian_Empire\n\t[414] link 'Sasanian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sasanian_Empire\n\t[415] link 'Seljuk Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Seljuk_Empire\n\t[416] link 'Soviet Union' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Soviet_Union\n\t[417] link 'War' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/War\n\t[419] heading 'Sciences'\nHISTORY_ACTION: \n\n"}, {"role": "assistant", "content": "<think>The objective is to find the name of the Second 
World War trilogy written by Evelyn Waugh. The current page has a search textbox with id [329] labeled \"Search 'Wikipedia'\". The most efficient way to find the answer is to search for \"Evelyn Waugh\" in Wikipedia, as his article should mention his major works, including the WWII trilogy. I will type \"Evelyn Waugh\" into the search box and press enter.</think>\n```type [329] [Evelyn Waugh] [1]```"}]}
5
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which Wisconsin Badgers player won a Heismann trophy and was elected to the Pro Bowl in each of their first four seasons?\nObservation: [13584] RootWebArea 'Alan Ameche' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche\n\t[13622] textbox \"Search 'Wikipedia'\" required: False\n\t[13628] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[13629] button '🏠'\n\t[13630] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[13631] button 'Wikipedia'\n\t[13632] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[13633] button '🎲'\n\t[13640] heading 'Alan Ameche' hasPopup: menu\n\t[13027] StaticText 'Alan Ameche (/əmiːtʃi/; June 1, 1933 – August 8, 1988), nicknamed \"The Iron Horse\", or simply \"The Horse\", was an '\n\t[13655] link 'American football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/American_football\n\t[13041] StaticText ' player who played six seasons with the '\n\t[13656] link 'Baltimore Colts' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_the_Baltimore_Colts\n\t[13043] StaticText ' in the '\n\t[13657] link 'National Football League' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/National_Football_League\n\t[13045] StaticText ' (NFL). He played '\n\t[13658] link 'college football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/College_football\n\t[13047] StaticText ' at the '\n\t[13659] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t[13049] StaticText ' and won the '\n\t[13660] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\n\t[13051] StaticText ' during his senior season in '\n\t[13661] link '1954' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_Wisconsin_Badgers_football_team\n\t[13053] StaticText '. He was elected to the '\n\t[13662] link 'Pro Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pro_Bowl\n\t[13055] StaticText ' in each of his first four seasons in the league. 
He is famous for scoring the winning touchdown in overtime in the '\n\t[13663] link '1958 NFL Championship Game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_NFL_Championship_Game\n\t[13057] StaticText ' against the '\n\t[13664] link 'New York Giants' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_New_York_Giants_season\n\t[13059] StaticText ', labeled \"The Greatest Game Ever Played.\"'\n\t[13666] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-mscwiot-1\n\t[13669] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-everplyd-2\n\t[13673] table 'Alan Ameche'\n\t\t[13674] caption ''\n\t\t[13676] row ''\n\t\t\t[13677] cell ''\n\t\t\t\t[13585] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Alan_Ameche_1960.jpg.webp\n\t\t[13679] row ''\n\t\t\t[13680] columnheader 'No. 35' required: False\n\t\t[13681] row ''\n\t\t\t[13682] rowheader 'Position:' required: False\n\t\t\t[13683] cell 'Fullback'\n\t\t\t\t[13684] link 'Fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13685] row ''\n\t\t\t[13686] columnheader 'Personal information' required: False\n\t\t[13687] row ''\n\t\t\t[13688] rowheader 'Born:' required: False\n\t\t\t[13689] cell 'June 1, 1933 Kenosha, Wisconsin, U.S.'\n\t\t\t\t[13692] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13693] row ''\n\t\t\t[13694] rowheader 'Died:' required: False\n\t\t\t[13695] cell 'August 8, 1988 (aged\\xa055) Houston, Texas, U.S.'\n\t[13189] StaticText 'With colleague and former Colts teammate '\n\t[13812] link 'Gino Marchetti' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino_Marchetti\n\t[13191] StaticText ', Ameche founded the '\n\t[13587] link \"Gino's Hamburgers\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino's_Hamburgers\n\t[13193] 
StaticText ' chain.'\n\t[13814] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-aaatyp-3\n\t[13816] group ''\n\t\t[13821] DisclosureTriangle 'Early life' expanded: True\n\t\t\t[13822] heading 'Early life'\n\t\t[13197] StaticText 'Born in '\n\t\t[13824] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13199] StaticText ', as Lino Dante Amici to Italian immigrant parents who came to the United States in the late 1920s, although they returned for a year to '\n\t\t[13826] link 'Italy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Italy\n\t\t[13203] StaticText ' during his childhood. The family then returned to Kenosha, where he attended '\n\t\t[13827] link 'Bradford High School' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mary_D._Bradford_High_School\n\t\t[13205] StaticText '. Ameche was a cousin of actor brothers '\n\t\t[13828] link 'Don' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Don_Ameche\n\t\t[13207] StaticText ' and '\n\t\t[13829] link 'Jim Ameche' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Jim_Ameche\n\t\t[13831] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-4\n\t[13833] group ''\n\t\t[13838] DisclosureTriangle 'College career' expanded: True\n\t\t\t[13839] heading 'College career'\n\t\t[13212] StaticText 'Ameche earned consensus '\n\t\t[13841] link 'All-America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_College_Football_All-America_Team\n\t\t[13214] StaticText ' honors at the '\n\t\t[13842] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/University_of_Wisconsin%E2%80%93Madison\n\t\t[13216] StaticText ', where he played '\n\t\t[13843] link 'linebacker' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Linebacker\n\t\t[13218] StaticText ' as well as '\n\t\t[13844] 
link 'fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13845] link 'single-platoon' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/One-platoon_system\n\t\t[13222] StaticText ' days. In four years as a '\n\t\t[13846] link 'Badger' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t\t[13847] link 'bowl game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Bowl_game\n\t\t[13226] StaticText ', the '\n\t\t[13848] link '1953 Rose Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1953_Rose_Bowl\n\t\t[13228] StaticText ', as a sophomore, rushing for 133 yards on 28 carries. Ameche won the '\n\t\t[13849] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\nHISTORY_ACTION: type [328] [Wisconsin Badgers Heisman Trophy winners] [1]\nclick [1391] [Wisconsin Badgers football]\nclick [4874] [Alan Ameche]\n"}, {"role": "assistant", "content": "<think>The page provides detailed information about Alan Ameche, including that he won the Heisman Trophy while at the University of Wisconsin–Madison and was elected to the Pro Bowl in each of his first four NFL seasons. This matches the user's query exactly.</think>\n```stop [Alan Ameche is the Wisconsin Badgers player who won a Heisman Trophy and was elected to the Pro Bowl in each of his first four seasons.]```"}]}
6
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who is the main character in that 70s show\nObservation: [1388] RootWebArea \"That '70s Show\" focused: True\n\t[1913] textbox \"Search 'Wikipedia'\" required: False\n\t[1915] link 'Go to welcome page'\n\t\t[1921] button '🏠'\n\t[1916] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1923] button 'Wikipedia'\n\t[1917] link 'Go to a randomly selected page'\n\t\t[1925] button '🎲'\n\t[1927] heading \"That '70s Show\" hasPopup: menu\n\t[1944] link 'period'\n\t[1946] link 'sitcom'\n\t[1947] StaticText ' that aired on '\n\t[1948] link 'Fox'\n\t[1949] StaticText ' from August 23, 1998, to May 18, 2006. 
The series focuses on the lives of a group of six teenage friends living in fictional Point Place, Wisconsin, from May 17, 1976, to December 31, 1979.'\n\t[1408] link '[]'\n\t[1599] table ''\n\t\t[2729] row ''\n\t\t\t[2748] columnheader \"That '70s Show\" required: False\n\t\t[2730] row ''\n\t\t\t[2749] gridcell '' required: False\n\t\t[1601] row ''\n\t\t\t[2750] rowheader 'Genre' required: False\n\t\t\t[1602] gridcell 'Period sitcom Teen sitcom' required: False\n\t\t\t\t[1606] ListMarker '■ '\n\t\t\t\t[3222] link 'Period'\n\t\t\t\t[3224] link 'sitcom'\n\t\t\t\t[1608] ListMarker '■ '\n\t\t\t\t[3227] link 'Teen sitcom'\n\t\t[1609] row ''\n\t\t\t[2751] rowheader 'Created by' required: False\n\t\t\t[1610] gridcell 'Bonnie Turner Terry Turner Mark Brazill' required: False\n\t\t\t\t[1614] ListMarker '■ '\n\t\t\t\t[3230] link 'Bonnie Turner'\n\t\t\t\t[1616] ListMarker '■ '\n\t\t\t\t[3232] link 'Terry Turner'\n\t\t\t\t[1618] ListMarker '■ '\n\t\t\t\t[3234] link 'Mark Brazill'\n\t\t[1619] row ''\n\t\t\t[2752] rowheader 'Directed by' required: False\n\t\t\t[1620] gridcell 'David Trainer Terry Hughes (pilot)' required: False\n\t\t\t\t[1624] ListMarker '■ '\n\t\t\t\t[3237] link 'David Trainer'\n\t\t\t\t[1626] ListMarker '■ '\n\t\t\t\t[3239] link 'Terry Hughes'\n\t\t\t\t[3240] StaticText ' (pilot)'\n\t\t[1627] row ''\n\t\t\t[2753] rowheader 'Starring' required: False\n\t\t\t[1628] gridcell 'Topher Grace Mila Kunis Ashton Kutcher Danny Masterson Laura Prepon Wilmer Valderrama Debra Jo Rupp Kurtwood Smith Tanya Roberts Don Stark Lisa Robin Kelly Tommy Chong Josh Meyers' required: False\n\t\t\t\t[1632] ListMarker '■ '\n\t\t\t\t[3243] link 'Topher Grace'\n\t\t\t\t[1634] ListMarker '■ '\n\t\t\t\t[3245] link 'Mila Kunis'\n\t\t\t\t[1636] ListMarker '■ '\n\t\t\t\t[3247] link 'Ashton Kutcher'\n\t[1950] StaticText 'The main teenage cast members were '\n\t[1951] link 'Topher Grace'\n\t[1953] link 'Mila Kunis'\n\t[1955] link 'Ashton Kutcher'\n\t[1957] link 'Danny Masterson'\n\t[1959] link 
'Laura Prepon'\n\t[1961] link 'Wilmer Valderrama'\n\t[1962] StaticText ', and '\n\t[1963] link 'Lisa Robin Kelly'\n\t[1964] StaticText '. The main adult cast members were '\n\t[1965] link 'Debra Jo Rupp'\n\t[1967] link 'Kurtwood Smith'\n\t[1969] link 'Don Stark'\n\t[1971] link 'Tommy Chong'\n\t[1972] StaticText ', and '\n\t[1973] link 'Tanya Roberts'\n\t[1975] StaticText 'In 1999, the show was remade by the '\n\t[1976] link 'ITV'\n\t[1977] StaticText ' network in the United Kingdom as '\n\t[1979] link 'Days Like These'\n\t[1980] StaticText ' using almost '\n\t[1981] link 'verbatim'\n\t[1982] StaticText ' scripts with minor changes to cultural references.'\n\t[1412] link '[]'\n\t[1414] Details ''\n\t\t[1694] DisclosureTriangle 'Cast' expanded: True\n\t\t\t[1983] heading 'Cast'\n\t\t[1985] note ''\n\t\t\t[1986] StaticText 'See also: '\n\t\t\t[1987] link \"List of That '70s, '80s, and '90s Show characters\"\n\t\t[1416] Details ''\n\t\t\t[1697] DisclosureTriangle 'Young cast' expanded: True\n\t\t\t\t[2805] heading 'Young cast'\n\t\t\t[1700] ListMarker '■ '\n\t\t\t[3376] link 'Topher Grace'\n\t\t\t[3378] link 'Eric Forman'\n\t\t\t[3379] StaticText ' (seasons 1–7; special guest season 8): Eric is a nice guy, geeky, physically slight and somewhat clumsy. He is a smart-ass with a fast wit and a deadpan sense of humor. His obsession with movies, particularly '\n\t\t\t[3381] link 'Star Wars'\n\t\t\t[3382] StaticText ', is often referenced throughout the show. For seven seasons Eric is in a relationship with his longtime love and neighbor '\n\t\t\t[3383] link 'Donna Pinciotti'\n\t\t\t[3384] StaticText '. His father, Red, is always hard on him. He convinces his parents to let his best friend '\n\t\t\t[3385] link 'Steven Hyde'\n\t\t\t[3386] StaticText ' move in with them, making Hyde like a brother. 
He decides to become a teacher after high school and he leaves the series at the end of the seventh season to teach in '\nHISTORY_ACTION: type [21] [That '70s Show] [1]\nclick [574] [link \"That '70s Show\"]\n"}, {"role": "assistant", "content": "<think>The section under \"Young cast\" describes Eric Forman, played by Topher Grace, in detail and refers to his central role, indicating he is the main character. I can now answer the question.</think>\n```stop [The main character in That '70s Show is Eric Forman, played by Topher Grace.]```"}]}
7
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who voices the main character in the lego movie\nObservation: [3271] RootWebArea 'The Lego Movie (franchise)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_(franchise)\n\t[3310] textbox \"Search 'Wikipedia'\" required: False\n\t[3316] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[3317] button '🏠'\n\t[3318] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[3319] button 'Wikipedia'\n\t[3320] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[3321] button '🎲'\n\t[3328] 
heading 'The Lego Movie (franchise)' hasPopup: menu\n\t[2722] StaticText 'The Lego Movie is an American '\n\t[3335] link 'media franchise' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Media_franchise\n\t[2725] StaticText ' based on '\n\t[3336] link 'Lego' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lego\n\t[3337] link 'construction toys' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Construction_toy\n\t[2729] StaticText '. It began with the 2014 film '\n\t[3339] link 'The Lego Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie\n\t[2731] StaticText ', which was directed and written by '\n\t[3340] link 'Phil Lord and Christopher Miller' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Phil_Lord_and_Christopher_Miller\n\t[2733] StaticText '. The success of the film led to the release of two licensed video games, a '\n\t[3341] link '4D film theme park attraction' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie%3A_4D_%E2%80%93_A_New_Adventure\n\t[2735] StaticText ', two spin-off films titled '\n\t[3343] link 'The Lego Batman Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Batman_Movie\n\t[2737] StaticText ' and '\n\t[3345] link 'The Lego Ninjago Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Ninjago_Movie\n\t[2739] StaticText ', which were released in 2017, '\n\t[3347] link 'Unikitty!' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Unikitty!\n\t[2741] StaticText ' an animated television series that also came out in the same year, and the sequel to the original film titled '\n\t[3349] link 'The Lego Movie 2: The Second Part' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_2%3A_The_Second_Part\n\t[2743] StaticText ' in 2019. Plans for a third spin-off film and a sequel to The Lego Batman Movie were later shelved. 
The franchise has received a generally positive critical reception, with the exception of The Lego Ninjago Movie which received mixed reviews, and has grossed $1 billion at the worldwide box office. While '\n\t[3352] link 'Warner Bros.' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Warner_Bros.\n\t[2749] StaticText ' still owned the rights to their run of the franchise from 2014 to 2020, including the Unikitty! animated television series, '\n\t[3354] link 'Universal Pictures' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Universal_Pictures\n\t[3358] table ''\n\t\t[3360] row ''\n\t\t\t[3361] columnheader 'The Lego Movie' required: False\n\t\t[3362] row ''\n\t\t\t[3363] cell 'Logo to the first film.'\n\t\t\t\t[3365] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/The_Lego_Movie_logo.png.webp\n\t\t[3367] row ''\n\t\t\t[3368] rowheader 'Created by' required: False\n\t\t\t[3369] cell 'Dan Lin Dan Hageman Kevin Hageman Roy Lee Phil Lord Christopher Miller'\n\t\t\t\t[2756] ListMarker '■ '\n\t\t\t\t[3373] link 'Dan Lin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dan_Lin\n\t\t\t\t[2757] ListMarker '■'\n\t\t\t\t[3375] link 'Dan Hageman Kevin Hageman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Hageman_Brothers\n\t\t\t\t\t[2758] ListMarker '■ '\n\t\t\t\t\t[2759] ListMarker '■ '\n\t[3462] group ''\n\t\t[3467] DisclosureTriangle 'Films' expanded: True\n\t\t\t[3468] heading 'Films'\nHISTORY_ACTION: type [329] [The Lego Movie] [1]\nclick [2005] [The Lego Movie (franchise)]\n"}, {"role": "assistant", "content": "<think>The user's objective is to find out who voices the main character in The Lego Movie. On the current page, there is a link to \"The Lego Movie\" ([3339]). Clicking this link should lead to a page specifically about the film, where cast information is likely to be found.</think>\n```click [3339] [The Lego Movie]```"}]}
8
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: girl fox name on fox and the hound\nObservation: [1816] RootWebArea 'The Fox and the Hound' focused: True\n\t[2460] textbox \"Search 'Wikipedia'\" required: False\n\t[2462] link 'Go to welcome page'\n\t\t[2468] button '🏠'\n\t[2463] link \"Go to the main page of 'Wikipedia'\"\n\t\t[2470] button 'Wikipedia'\n\t[2464] link 'Go to a randomly selected page'\n\t\t[2472] button '🎲'\n\t[1829] table ''\n\t\t[4619] row ''\n\t\t\t[4628] gridcell 'Original theatrical release poster' required: False\n\t\t[2114] row ''\n\t\t\t[4629] rowheader 'Directed by' required: False\n\t\t\t[2115] gridcell 'Ted Berman Richard Rich Art Stevens' required: False\n\t\t\t\t[2119] 
ListMarker '■ '\n\t\t\t\t[5171] link 'Ted Berman'\n\t\t\t\t[2121] ListMarker '■ '\n\t\t\t\t[5173] link 'Richard Rich'\n\t\t\t\t[2123] ListMarker '■ '\n\t\t\t\t[5175] link 'Art Stevens'\n\t\t[2124] row ''\n\t\t\t[4630] rowheader 'Story by' required: False\n\t\t\t[2125] gridcell 'Larry Clemmons Ted Berman David Michener Peter Young Burny Mattinson Steve Hulett Earl Kress Vance Gerry' required: False\n\t\t\t\t[2129] ListMarker '■ '\n\t\t\t\t[5178] link 'Larry Clemmons'\n\t\t\t\t[2131] ListMarker '■ '\n\t\t\t\t[5180] StaticText 'Ted Berman'\n\t\t\t\t[2133] ListMarker '■ '\n\t\t\t\t[5181] StaticText 'David Michener'\n\t\t\t\t[2135] ListMarker '■ '\n\t\t\t\t[5182] StaticText 'Peter Young'\n\t\t\t\t[2137] ListMarker '■ '\n\t\t\t\t[5183] link 'Burny Mattinson'\n\t\t\t\t[2139] ListMarker '■ '\n\t\t\t\t[5185] StaticText 'Steve Hulett'\n\t\t\t\t[2141] ListMarker '■ '\n\t\t\t\t[5186] StaticText 'Earl Kress'\n\t\t\t\t[2143] ListMarker '■ '\n\t\t\t\t[5187] link 'Vance Gerry'\n\t\t[4620] row ''\n\t\t\t[4631] rowheader 'Based on' required: False\n\t\t\t[4632] gridcell 'The Fox and the Hound by Daniel P. Mannix' required: False\n\t\t\t\t[5191] link 'The Fox and the Hound'\n\t\t\t\t[5194] link 'Daniel P. 
Mannix'\n\t\t[2144] row ''\n\t\t\t[4633] rowheader 'Produced by' required: False\n\t\t\t[2145] gridcell 'Ron Miller Wolfgang Reitherman Art Stevens' required: False\n\t\t\t\t[2149] ListMarker '■ '\n\t\t\t\t[5198] link 'Ron Miller'\n\t\t\t\t[2151] ListMarker '■ '\n\t\t\t\t[5200] link 'Wolfgang Reitherman'\n\t\t\t\t[2153] ListMarker '■ '\n\t\t\t\t[5202] StaticText 'Art Stevens'\n\t\t[2154] row ''\n\t\t\t[4634] rowheader 'Starring' required: False\n\t\t\t[2155] gridcell 'Mickey Rooney Kurt Russell Pearl Bailey Jack Albertson Sandy Duncan Jeanette Nolan Pat Buttram John Fiedler John McIntire Dick Bakalyan Paul Winchell Keith Mitchell Corey Feldman' required: False\n\t\t\t\t[2159] ListMarker '■ '\n\t\t\t\t[5204] link 'Mickey Rooney'\n\t\t\t\t[2161] ListMarker '■ '\n\t\t\t\t[5206] link 'Kurt Russell'\n\t\t\t\t[2163] ListMarker '■ '\n\t\t\t\t[5208] link 'Pearl Bailey'\n\t\t\t\t[2165] ListMarker '■ '\n\t\t\t\t[5210] link 'Jack Albertson'\n\t[2563] StaticText 'Walt Disney Productions first obtained the film rights to the novel by Daniel P. Mannix in 1967; however, actual development on the film would not occur until spring 1977. The Fox and the Hound marked the last involvement of the remaining members of '\n\t[2567] link \"Disney's Nine Old Men\"\n\t[2568] StaticText ', which included '\n\t[2569] link 'Frank Thomas'\n\t[2570] StaticText ' and '\n\t[2571] link 'Ollie Johnston'\n\t[2572] StaticText '. Though the old animators had involvement in early development of the film, the film was ultimately handed over to a new generation of animators following the retirement of the old animators. As such, it was the first film for future directors including '\n\t[2573] link 'Tim Burton'\n\t[2575] link 'Brad Bird'\n\t[2576] StaticText ', and '\n\t[2577] link 'John Lasseter'\n\t[2579] link 'Don Bluth'\n\t[2580] StaticText ' and his team of animators. 
Further concerns were raised over the handling of the scene in which Chief is hit by a train, which was changed from his death into a non-fatal injury by which he merely suffers a broken leg.'\n\t[2582] StaticText 'The Fox and the Hound was released to theaters on July 10, 1981 by '\n\t[2584] link 'Buena Vista Distribution'\n\t[2585] StaticText '. The film was a financial success, becoming the 14th highest-grossing film of the year and earning $39.9 million in the United States. However, it initially received mixed reviews from critics, who praised the animation and voice acting of the film, but believed that the film was not groundbreaking enough. Over time, however, the film became known as an underrated classic, and it was nominated for three awards, of which it won one. At the time of its release, it was the most expensive animated film produced to date, costing $12 million.'\n\t[1843] link '[]'\n\t[2586] StaticText ' It was re-released to theaters on March 25, 1988.'\n\t[1846] link '[]'\n\t[2587] StaticText ' An intermediate follow-up, '\n\t[2589] link 'The Fox and the Hound 2'\n\t[2590] StaticText ', was released '\n\t[2591] link 'directly-to-DVD'\n\t[2592] StaticText ' on December 12, 2006.'\n\t[2198] Details ''\n\t\t[2200] DisclosureTriangle 'Plot' expanded: True\n\t\t\t[2593] heading 'Plot'\n\t\t[2598] StaticText 'After a young '\n\t\t[2599] link 'red fox'\n\t\t[2600] StaticText ' is orphaned, Big Mama the '\n\t\t[2601] link 'owl'\n\t\t[2602] StaticText ' and her friends, Dinky the '\n\t\t[2603] link 'finch'\n\t\t[2604] StaticText ' and Boomer the '\n\t\t[2605] link 'woodpecker'\n\t\t[2606] StaticText ', arrange for him to be adopted by kindly farmer Widow Tweed, who names him Tod. 
Meanwhile, her neighbor, hunter Amos Slade, brings home a young '\nHISTORY_ACTION: type [21] [fox and the hound] [1]\nclick [570] [link 'The Fox and the Hound']\nscroll [down]\n"}, {"role": "assistant", "content": "<think>The plot section has just started, but the answer may be further down. I should continue scrolling to find the name of the girl fox character.</think>\n```scroll [down]```"}]}
9
+ {"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who sang with you i'm born again\nObservation: [1219] RootWebArea \"With You I'm Born Again\" focused: True\n\t[1532] textbox \"Search 'Wikipedia'\" required: False\n\t[1534] link 'Go to welcome page'\n\t\t[1540] button '🏠'\n\t[1535] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1542] button 'Wikipedia'\n\t[1536] link 'Go to a randomly selected page'\n\t\t[1544] button '🎲'\n\t[1546] heading \"With You I'm Born Again\" hasPopup: menu\n\t[1556] StaticText '\"With You I\\'m Born Again\" is a 1979 '\n\t[1560] link 'duet'\n\t[1561] StaticText ' written by '\n\t[1562] link 'Carol Connors'\n\t[1563] StaticText ' and '\n\t[1564] link 'David 
Shire'\n\t[1565] StaticText ' that originated on the soundtrack of the 1979 motion picture '\n\t[1567] link 'Fast Break'\n\t[1568] StaticText '. It was performed by '\n\t[1569] link 'Motown'\n\t[1570] StaticText ' recording artists '\n\t[1571] link 'Billy Preston'\n\t[1572] StaticText ' and '\n\t[1573] link 'Syreeta Wright'\n\t[1574] StaticText ' (credited as Syreeta) and became an international hit for the duo, reaching number four on the '\n\t[1575] link 'Billboard Hot 100'\n\t[1230] link '[]'\n\t[1576] StaticText ' and number two on the UK singles chart.'\n\t[1233] link '[]'\n\t[1235] table ''\n\t\t[1991] row ''\n\t\t\t[1997] columnheader '\"With You I\\'m Born Again\"' required: False\n\t\t[1992] row ''\n\t\t\t[1998] gridcell '' required: False\n\t\t[1993] row ''\n\t\t\t[1999] columnheader 'Single by Billy Preston and Syreeta Wright' required: False\n\t\t\t\t[2095] link 'Single'\n\t\t\t\t[2097] link 'Billy Preston'\n\t\t\t\t[2099] link 'Syreeta Wright'\n\t\t[1994] row ''\n\t\t\t[2000] columnheader 'from the album Fast Break' required: False\n\t\t[1237] row ''\n\t\t\t[2001] rowheader 'B-side' required: False\n\t\t\t\t[2107] link 'B-side'\n\t\t\t[1238] gridcell '\"All I Wanted Was You\"[]' required: False\n\t\t\t\t[1240] link '[]'\n\t\t[1242] row ''\n\t\t\t[2002] rowheader 'Released' required: False\n\t\t\t[1243] gridcell 'December 1979[]' required: False\n\t\t\t\t[1245] link '[]'\n\t\t[1995] row ''\n\t\t\t[2003] rowheader 'Genre' required: False\n\t\t\t\t[2121] link 'Genre'\n\t\t\t[2004] gridcell 'Soul' required: False\n\t\t\t\t[2123] link 'Soul'\n\t\t[1247] row ''\n\t\t\t[2005] rowheader 'Label' required: False\n\t\t\t\t[2125] link 'Label'\n\t\t\t[1248] gridcell 'Motown[]' required: False\n\t\t\t\t[2127] link 'Motown'\n\t\t\t\t[1250] link '[]'\n\t\t[1996] row ''\n\t\t\t[2006] rowheader 'Songwriter(s)' required: False\n\t\t\t\t[2134] link 'Songwriter(s)'\n\t\t\t[2007] gridcell 'Carol Connors · David Shire' required: False\n\t\t\t\t[2140] link 'Carol 
Connors'\n\t\t\t\t[2144] link 'David Shire'\n\t\t[1252] row ''\n\t\t\t[2008] rowheader 'Producer(s)' required: False\n\t\t\t\t[2147] link 'Producer(s)'\n\t\t\t[1253] gridcell 'James DiPasquale[]' required: False\n\t\t\t\t[2149] link 'James DiPasquale'\n\t\t\t\t[1255] link '[]'\n\t[1257] Details ''\n\t\t[1429] DisclosureTriangle 'History' expanded: True\n\t\t\t[1577] heading 'History'\n\t\t[1579] StaticText 'The music from '\n\t\t[1581] link 'Fast Break'\n\t\t[1582] StaticText ' was credited to '\n\t\t[1583] link 'David Shire'\n\t\t[1584] StaticText ' and '\n\t\t[1585] link 'James DiPasquale'\n\t\t[1586] StaticText ', and '\n\t\t[1587] link 'Carol Connors'\n\t\t[1588] StaticText ' put words to four of the songs on the soundtrack album. Connors recalls, \"I had written most of the lyrics to \\'With You I\\'m Born Again\\' in 22 minutes, but I couldn\\'t find two words and it was driving me crazy.\"'\n\t\t[1261] link '[]'\n\t\t[1589] StaticText ' The right words came to her on her way to an ice cream parlor, where she requested a pencil and paper along with her order. \"I had finally found it—the line I was seeking--\\'Come show me your kindness, in your arms I know I\\'ll find this\\'—and I wanted to get it down on paper.\"'\n\t\t[1264] link '[]'\n\t\t[1590] link 'Motown'\n\t\t[1591] StaticText ' executive '\n\t\t[1592] link 'Suzanne de Passe'\n\t\t[1268] link '[]'\n\t\t[1594] StaticText ' and each singer also had a solo number to cut for the soundtrack album as well. Connors recalled, \"Billy and Syreeta were originally not very fond of the idea of recording together—they each wanted to do their own thing, but this worked out extremely well for both of them.\"'\n\t\t[1271] link '[]'\n\t\t[1595] StaticText ' The reluctant pair were surprised by the success of \"With You I\\'m Born Again\". 
\"Though both liked the song, they never dreamed of it being a hit single.\"'\n\t\t[1274] link '[]'\n\t\t[1596] StaticText 'Connors noted the impression some were left with by the song and Preston. \"When David Shire heard Billy Preston play the song on the organ, he said to me, \\'Now I know why he\\'s a star.\\'\" Connors also shared, \"The late great '\nHISTORY_ACTION: type [21] [With You I'm Born Again] [1]\nclick [570] [link \"With You I'm Born Again\"]\n"}, {"role": "assistant", "content": "<think>The answer can be found in the static text: '\"With You I'm Born Again\"... It was performed by Motown recording artists Billy Preston and Syreeta Wright (credited as Syreeta) and became an international hit for the duo.' The singers are Billy Preston and Syreeta Wright.</think>\n```stop [Billy Preston and Syreeta Wright]```"}]}
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/args.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 4,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 1e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 20.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "corr_nq_1500q_hotpot_1500q_new600q_swift.jsonl"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.001,
239
+ "dataset_num_proc": 100,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": false,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [],
287
+ "freeze_parameters_regex": null,
288
+ "freeze_parameters_ratio": 0.0,
289
+ "trainable_parameters": [],
290
+ "trainable_parameters_regex": null,
291
+ "freeze_llm": false,
292
+ "freeze_vit": true,
293
+ "freeze_aligner": false,
294
+ "target_modules": [
295
+ "all-linear"
296
+ ],
297
+ "target_regex": null,
298
+ "modules_to_save": [],
299
+ "lora_rank": 8,
300
+ "lora_alpha": 32,
301
+ "lora_dropout": 0.05,
302
+ "lora_bias": "none",
303
+ "lora_dtype": null,
304
+ "lorap_lr_ratio": null,
305
+ "use_rslora": false,
306
+ "use_dora": false,
307
+ "lora_ga_batch_size": 2,
308
+ "lora_ga_iters": 2,
309
+ "lora_ga_max_length": 1024,
310
+ "lora_ga_direction": "ArB2r",
311
+ "lora_ga_scale": "stable",
312
+ "lora_ga_stable_gamma": 16,
313
+ "init_weights": true,
314
+ "fourier_n_frequency": 2000,
315
+ "fourier_scaling": 300.0,
316
+ "boft_block_size": 4,
317
+ "boft_block_num": 0,
318
+ "boft_n_butterfly_factor": 1,
319
+ "boft_dropout": 0.0,
320
+ "vera_rank": 256,
321
+ "vera_projection_prng_key": 0,
322
+ "vera_dropout": 0.0,
323
+ "vera_d_initial": 0.1,
324
+ "adapter_act": "gelu",
325
+ "adapter_length": 128,
326
+ "use_galore": false,
327
+ "galore_target_modules": null,
328
+ "galore_rank": 128,
329
+ "galore_update_proj_gap": 50,
330
+ "galore_scale": 1.0,
331
+ "galore_proj_type": "std",
332
+ "galore_optim_per_parameter": false,
333
+ "galore_with_embedding": false,
334
+ "galore_quantization": false,
335
+ "galore_proj_quant": false,
336
+ "galore_proj_bits": 4,
337
+ "galore_proj_group_size": 256,
338
+ "galore_cos_threshold": 0.4,
339
+ "galore_gamma_proj": 2,
340
+ "galore_queue_size": 5,
341
+ "adalora_target_r": 8,
342
+ "adalora_init_r": 12,
343
+ "adalora_tinit": 0,
344
+ "adalora_tfinal": 0,
345
+ "adalora_deltaT": 1,
346
+ "adalora_beta1": 0.85,
347
+ "adalora_beta2": 0.85,
348
+ "adalora_orth_reg_weight": 0.5,
349
+ "llamapro_num_new_blocks": 4,
350
+ "llamapro_num_groups": null,
351
+ "lisa_activated_layers": 0,
352
+ "lisa_step_interval": 20,
353
+ "reft_layer_key": null,
354
+ "reft_layers": null,
355
+ "reft_rank": 4,
356
+ "reft_intervention_type": "LoreftIntervention",
357
+ "reft_args": null,
358
+ "swanlab_token": null,
359
+ "swanlab_project": null,
360
+ "swanlab_workspace": null,
361
+ "swanlab_exp_name": null,
362
+ "swanlab_lark_webhook_url": null,
363
+ "swanlab_lark_secret": null,
364
+ "swanlab_mode": "cloud",
365
+ "add_version": true,
366
+ "create_checkpoint_symlink": false,
367
+ "zero_hpz_partition_size": null,
368
+ "deepspeed_autotp_size": null,
369
+ "early_stop_interval": null,
370
+ "rank": 0,
371
+ "global_world_size": 8,
372
+ "local_world_size": 8,
373
+ "model_suffix": "Qwen2.5-7B",
374
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
375
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f57c7e2ce50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
376
+ "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
377
+ "hub": "<class 'swift.hub.hub.MSHub'>",
378
+ "evaluation_strategy": "epoch",
379
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
380
+ }
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }