hongzhuyi committed on Aug 21, 2025

Commit

9747166

verified ·

1 Parent(s): a2a0e56

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
log/20250820-18:06:07.log +130 -0
log/20250820-18:07:11.log +130 -0
log/20250820-18:08:00.log +0 -0
log/20250820-21:12:01.log +0 -0
log/20250821-16:42:45.log +0 -0
log/20250821-21:47:08.log +645 -0
log/20250821-21:56:01.log +0 -0
log/20250821-23:09:03.log +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/args.json +380 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/added_tokens.json +24 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/args.json +380 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/chat_template.jinja +54 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/config.json +60 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/generation_config.json +9 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/latest +1 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/merges.txt +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00001-of-00004.safetensors +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00002-of-00004.safetensors +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00003-of-00004.safetensors +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00004-of-00004.safetensors +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model.safetensors.index.json +347 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/scheduler.pt +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/special_tokens_map.json +31 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer_config.json +207 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/trainer_state.json +1300 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/training_args.bin +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/vocab.json +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/zero_to_fp32.py +760 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_loss.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_runtime.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_samples_per_second.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_steps_per_second.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_token_acc.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_epoch.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_grad_norm.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_learning_rate.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_loss.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_token_acc.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_total_flos.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_loss.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_runtime.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_samples_per_second.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_steps_per_second.png +0 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/logging.jsonl +160 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/runs/events.out.tfevents.1755695667.TENCENT64.site.2602616.0 +3 -0
qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/val_dataset.jsonl +9 -0
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/args.json +380 -0
qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/added_tokens.json +24 -0

.gitattributes CHANGED Viewed

@@ -43,3 +43,5 @@ checkpoint-197/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 qwen2.5-7b-1700q-661q-1e_6-old-click-1ep/checkpoint-497/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 qwen2.5-7b-2225q-661q-1e_6-old-click-1ep/checkpoint-607/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 qwen2.5-7b-rft-661q-all-1e_6-old-click-1ep/checkpoint-243/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 qwen2.5-7b-1700q-661q-1e_6-old-click-1ep/checkpoint-497/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 qwen2.5-7b-2225q-661q-1e_6-old-click-1ep/checkpoint-607/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 qwen2.5-7b-rft-661q-all-1e_6-old-click-1ep/checkpoint-243/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/tokenizer.json filter=lfs diff=lfs merge=lfs -text

log/20250820-18:06:07.log ADDED Viewed

	@@ -0,0 +1,130 @@

+run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Base --train_type full --dataset corr_nq_1500q_hotpot_1500q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 8 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[ERROR:modelscope] Repo Qwen/Qwen2.5-7B-Base not exists on either https://www.modelscope.cn or https://www.modelscope.ai
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 773, in urlopen
+    self._prepare_proxy(conn)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1042, in _prepare_proxy
+    conn.connect()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 770, in connect
+    self._tunnel()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 269, in _tunnel
+    raise OSError(
+OSError: Tunnel connection failed: 403 Forbidden
+The above exception was the direct cause of the following exception:
+urllib3.exceptions.ProxyError: ('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden'))
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
+    resp = conn.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
+    return self.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
+    return self.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
+    retries = retries.increment(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
+    raise MaxRetryError(_pool, url, reason) from reason  # type: ignore[arg-type]
+urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
+During handling of the above exception, another exception occurred:
+Traceback (most recent call last):
+  File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+    sft_main()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+    return SwiftSft(args).main()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 27, in __init__
+    super().__init__(args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 19, in __init__
+    self.args = self._parse_args(args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 31, in _parse_args
+    args, remaining_argv = parse_args(self.args_class, args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/utils/utils.py", line 152, in parse_args
+    args, remaining_args = parser.parse_args_into_dataclasses(argv, return_remaining_strings=True)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
+    obj = dtype(**inputs)
+  File "<string>", line 323, in __init__
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 171, in __post_init__
+    BaseArguments.__post_init__(self)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/base_args.py", line 171, in __post_init__
+    ModelArguments.__post_init__(self)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 176, in __post_init__
+    self._init_torch_dtype()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 90, in _init_torch_dtype
+    self.torch_dtype: torch.dtype = self._init_model_info()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 145, in _init_model_info
+    self.model_info, self.model_meta = get_model_info_meta(**self.get_model_kwargs())
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/register.py", line 533, in get_model_info_meta
+    model_dir = safe_snapshot_download(
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/utils.py", line 302, in safe_snapshot_download
+    model_dir = hub.download_model(model_id_or_path, revision, ignore_patterns, token=hub_token, **kwargs)
+  File "/group/40143/hongzhuyi/ms-swift/swift/hub/hub.py", line 317, in download_model
+    return snapshot_download(model_id_or_path, revision, ignore_patterns=ignore_patterns, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 132, in snapshot_download
+    return _snapshot_download(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 298, in _snapshot_download
+    endpoint = _api.get_endpoint_for_read(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 390, in get_endpoint_for_read
+    self.repo_exists(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 446, in repo_exists
+    r = self.session.get(path, cookies=cookies,
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 602, in get
+    return self.request("GET", url, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
+    resp = self.send(prep, **send_kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
+    r = adapter.send(request, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 694, in send
+    raise ProxyError(e, request=request)
+requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
+W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577686 closing signal SIGTERM
+W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577687 closing signal SIGTERM
+W0820 18:06:47.467000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577688 closing signal SIGTERM
+W0820 18:06:47.469000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577689 closing signal SIGTERM
+W0820 18:06:47.470000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577690 closing signal SIGTERM
+W0820 18:06:47.472000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577691 closing signal SIGTERM
+W0820 18:06:47.474000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2577693 closing signal SIGTERM
+E0820 18:06:47.570000 2577617 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 6 (pid: 2577692) of binary: /data/miniforge/envs/ms-swift/bin/python3.10
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+    return f(*args, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-08-20_18:06:47
+  host      : TENCENT64.site
+  rank      : 6 (local_rank: 6)
+  exitcode  : 1 (pid: 2577692)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================

log/20250820-18:07:11.log ADDED Viewed

	@@ -0,0 +1,130 @@

+run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Base --train_type full --dataset corr_nq_1500q_hotpot_1500q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 8 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[ERROR:modelscope] Repo Qwen/Qwen2.5-7B-Base not exists on either https://www.modelscope.cn or https://www.modelscope.ai
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 773, in urlopen
+    self._prepare_proxy(conn)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1042, in _prepare_proxy
+    conn.connect()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 770, in connect
+    self._tunnel()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connection.py", line 269, in _tunnel
+    raise OSError(
+OSError: Tunnel connection failed: 403 Forbidden
+The above exception was the direct cause of the following exception:
+urllib3.exceptions.ProxyError: ('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden'))
+The above exception was the direct cause of the following exception:
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
+    resp = conn.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
+    return self.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 871, in urlopen
+    return self.urlopen(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
+    retries = retries.increment(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
+    raise MaxRetryError(_pool, url, reason) from reason  # type: ignore[arg-type]
+urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
+During handling of the above exception, another exception occurred:
+Traceback (most recent call last):
+  File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+    sft_main()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+    return SwiftSft(args).main()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 27, in __init__
+    super().__init__(args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 19, in __init__
+    self.args = self._parse_args(args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 31, in _parse_args
+    args, remaining_argv = parse_args(self.args_class, args)
+  File "/group/40143/hongzhuyi/ms-swift/swift/utils/utils.py", line 152, in parse_args
+    args, remaining_args = parser.parse_args_into_dataclasses(argv, return_remaining_strings=True)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
+    obj = dtype(**inputs)
+  File "<string>", line 323, in __init__
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/train_args.py", line 171, in __post_init__
+    BaseArguments.__post_init__(self)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/base_args.py", line 171, in __post_init__
+    ModelArguments.__post_init__(self)
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 176, in __post_init__
+    self._init_torch_dtype()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 90, in _init_torch_dtype
+    self.torch_dtype: torch.dtype = self._init_model_info()
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/argument/base_args/model_args.py", line 145, in _init_model_info
+    self.model_info, self.model_meta = get_model_info_meta(**self.get_model_kwargs())
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/register.py", line 533, in get_model_info_meta
+    model_dir = safe_snapshot_download(
+  File "/group/40143/hongzhuyi/ms-swift/swift/llm/model/utils.py", line 302, in safe_snapshot_download
+    model_dir = hub.download_model(model_id_or_path, revision, ignore_patterns, token=hub_token, **kwargs)
+  File "/group/40143/hongzhuyi/ms-swift/swift/hub/hub.py", line 317, in download_model
+    return snapshot_download(model_id_or_path, revision, ignore_patterns=ignore_patterns, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 132, in snapshot_download
+    return _snapshot_download(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 298, in _snapshot_download
+    endpoint = _api.get_endpoint_for_read(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 390, in get_endpoint_for_read
+    self.repo_exists(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/modelscope/hub/api.py", line 446, in repo_exists
+    r = self.session.get(path, cookies=cookies,
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 602, in get
+    return self.request("GET", url, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
+    resp = self.send(prep, **send_kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
+    r = adapter.send(request, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/requests/adapters.py", line 694, in send
+    raise ProxyError(e, request=request)
+requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.modelscope.ai', port=443): Max retries exceeded with url: /api/v1/models/Qwen/Qwen2.5-7B-Base (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
+W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580150 closing signal SIGTERM
+W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580151 closing signal SIGTERM
+W0820 18:07:24.029000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580153 closing signal SIGTERM
+W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580154 closing signal SIGTERM
+W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580155 closing signal SIGTERM
+W0820 18:07:24.031000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580156 closing signal SIGTERM
+W0820 18:07:24.032000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2580157 closing signal SIGTERM
+E0820 18:07:24.129000 2580078 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 2580152) of binary: /data/miniforge/envs/ms-swift/bin/python3.10
+Traceback (most recent call last):
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+    return f(*args, **kwargs)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/miniforge/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-08-20_18:07:24
+  host      : TENCENT64.site
+  rank      : 2 (local_rank: 2)
+  exitcode  : 1 (pid: 2580152)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================

log/20250820-18:08:00.log ADDED Viewed

The diff for this file is too large to render. See raw diff

log/20250820-21:12:01.log ADDED Viewed

The diff for this file is too large to render. See raw diff

log/20250821-16:42:45.log ADDED Viewed

The diff for this file is too large to render. See raw diff

log/20250821-21:47:08.log ADDED Viewed

	@@ -0,0 +1,645 @@



































































































































































































































































































































































































































































































































































































































































0

1

+run sh: `/data/miniforge/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B --train_type full --dataset /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 1 --per_device_train_batch_size 4 --learning_rate 1e-6 --gradient_accumulation_steps 2 --eval_steps 20 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:47:33,244] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:47:34,608] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:34,617] [INFO] [comm.py:821:init_distributed] cdb=None
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:47:38,267] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:47:39,606] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:39,615] [INFO] [comm.py:821:init_distributed] cdb=None
+[2025-08-21 21:47:42,236] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:47:43,646] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:43,655] [INFO] [comm.py:821:init_distributed] cdb=None
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:47:48,256] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:47:49,600] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:49,609] [INFO] [comm.py:821:init_distributed] cdb=None
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:47:54,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:47:55,585] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:55,593] [INFO] [comm.py:821:init_distributed] cdb=None
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B
+[2025-08-21 21:47:58,145] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:47:59,817] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:47:59,826] [INFO] [comm.py:821:init_distributed] cdb=None
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[INFO:modelscope] Target directory already exists, skipping creation.
+[INFO:swift] Loading the model using model_dir: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+[2025-08-21 21:48:36,241] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:48:37,911] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:48:37,921] [INFO] [comm.py:821:init_distributed] cdb=None
+[2025-08-21 21:48:37,921] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-08-21 21:48:41,189] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-08-21 21:48:42,841] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+[2025-08-21 21:48:42,850] [INFO] [comm.py:821:init_distributed] cdb=None
+[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=True,
+agent_template=None,
+aligner_lr=None,
+attn_impl=None,
+auto_find_batch_size=False,
+average_tokens_across_devices=True,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=True,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=48,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_swift.jsonl'],
+dataset_num_proc=100,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=18000000,
+debug=None,
+deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+early_stop_interval=None,
+enable_dft_loss=False,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=20.0,
+eval_strategy=epoch,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+extra_eval_args=None,
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=False,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=2,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_revision=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=1e-06,
+length_column_name=length,
+liger_kernel_config=None,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=1,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=16240,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=-1,
+metric=None,
+metric_for_best_model=loss,
+model=Qwen/Qwen2.5-7B,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen2_5,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=1.0,
+optim=adamw_torch_fused,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846,
+overwrite_output_dir=False,
+packing=False,
+packing_length=None,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=1,
+per_device_train_batch_size=4,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=500,
+save_strategy=epoch,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.001,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=False,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen2_5,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_flash_ckpt=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=[],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:48:52,405] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:49:13,760] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B
+[2025-08-21 21:49:17,176] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[INFO:modelscope] Target directory already exists, skipping creation.
+[INFO:swift] Loading the model using model_dir: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B
+[INFO:swift] model_kwargs: {'device_map': None}
+[2025-08-21 21:49:23,592] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:49:28,463] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:49:47,786] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:50:01,618] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+Downloading Model from https://www.modelscope.cn to directory: /usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B
+[2025-08-21 21:50:05,699] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+[2025-08-21 21:50:06,870] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
+[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.2",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
+[INFO:swift] max_length: 16240
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] Start time of running main: 2025-08-21 21:50:16.860705
+[INFO:swift] swift.__version__: 3.8.0.dev0
+Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+[INFO:swift] train_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 5973
+})
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 5
+})
+[INFO:swift] The split dataset from the training set will be saved at: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/val_dataset.jsonl.
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+[INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 264, 6929, 16230, 17847, 6188, 311, 9026, 3019, 14319, 29208, 6929, 7525, 29720, 323, 23638, 311, 4583, 279, 1196, 594, 3383, 13, 1446, 525, 3897, 448, 3151, 9079, 323, 44610, 13904, 1995, 11, 323, 498, 1184, 311, 2550, 13382, 6168, 311, 22054, 279, 1196, 594, 3383, 382, 8420, 594, 279, 1995, 498, 3278, 614, 510, 785, 1196, 594, 16538, 25, 1096, 374, 279, 3383, 498, 2299, 4460, 311, 4583, 624, 785, 1482, 3482, 2150, 594, 39700, 4916, 25, 1096, 374, 264, 43799, 13042, 315, 279, 44610, 11, 8241, 1376, 1995, 624, 785, 1787, 22398, 25, 4220, 525, 279, 22398, 498, 614, 1787, 624, 785, 3681, 6168, 25, 2619, 525, 279, 6168, 498, 1101, 10660, 13, 1084, 1231, 387, 10950, 311, 3754, 697, 5098, 624, 14873, 2669, 1730, 25, 8085, 5435, 311, 279, 1482, 3239, 429, 702, 1012, 10820, 304, 13656, 6168, 13, 1446, 1184, 311, 31072, 323, 22047, 419, 1995, 382, 785, 6168, 498, 646, 2736, 4399, 1119, 3807, 11059, 1447, 2665, 16730, 26722, 510, 63, 3678, 508, 307, 60, 508, 1796, 60, 44622, 1096, 1917, 27749, 389, 458, 2392, 448, 264, 3151, 877, 389, 279, 44610, 624, 63, 1313, 508, 307, 60, 508, 1796, 60, 508, 1873, 37480, 19844, 28, 15, 91, 16, 60, 44622, 5443, 419, 311, 943, 279, 2213, 1119, 279, 2070, 448, 877, 13, 3216, 1638, 11, 279, 1591, 6269, 3014, 1376, 374, 17320, 1283, 19496, 7241, 3493, 37480, 19844, 374, 738, 311, 220, 15, 624, 63, 17583, 508, 307, 60, 508, 1796, 60, 44622, 85569, 916, 458, 2392, 448, 877, 624, 63, 1873, 508, 792, 34454, 60, 44622, 220, 4467, 23156, 279, 25352, 315, 264, 1376, 10601, 389, 279, 13625, 320, 68, 1302, 2572, 37014, 98267, 4292, 63, 12605, 508, 2923, 91, 454, 60, 44622, 22392, 279, 2150, 705, 476, 1495, 382, 8582, 9551, 26722, 510, 63, 931, 17344, 44622, 5264, 264, 501, 11, 4287, 6929, 5651, 624, 63, 6192, 47492, 508, 6192, 3560, 60, 44622, 15586, 279, 6929, 594, 5244, 311, 264, 3151, 5651, 1667, 1181, 1922, 624, 63, 5552, 17344, 44622, 13032, 279, 5023, 4541, 5651, 382, 3144, 17980, 26722, 
510, 63, 28535, 508, 1085, 60, 44622, 81739, 311, 264, 3151, 5548, 624, 63, 3346, 3895, 44622, 81739, 311, 279, 8597, 19334, 2150, 624, 63, 3346, 32121, 44622, 81739, 311, 279, 1790, 2150, 320, 333, 264, 3681, 364, 3346, 3895, 6, 1917, 572, 10660, 3593, 33190, 5586, 510, 63, 9495, 508, 9217, 60, 44622, 25226, 419, 1917, 979, 498, 4411, 279, 3383, 374, 4583, 13, 1416, 279, 16538, 374, 311, 1477, 264, 1467, 5980, 4226, 11, 3410, 279, 4226, 304, 279, 31642, 13, 1416, 498, 4411, 279, 3383, 374, 11997, 311, 4583, 11, 3410, 279, 4226, 438, 1591, 45, 10360, 3014, 304, 279, 31642, 382, 1249, 387, 6849, 11, 432, 374, 1602, 2989, 311, 1795, 279, 2701, 5601, 510, 16, 13, 1446, 1265, 1172, 4265, 458, 1917, 429, 374, 2697, 2661, 279, 1482, 21930, 624, 17, 13, 1446, 1265, 1172, 4265, 825, 1917, 518, 264, 882, 624, 18, 13, 1446, 1265, 1795, 279, 10295, 311, 2874, 3019, 553, 3019, 323, 1221, 4265, 279, 1790, 1917, 624, 19, 13, 1446, 1265, 8300, 311, 13656, 6168, 979, 4265, 458, 1917, 323, 1430, 537, 311, 1281, 58077, 6168, 198, 20, 13, 2009, 32711, 1969, 387, 4766, 30586, 26865, 1472, 26865, 43626, 9492, 11, 323, 1052, 1969, 387, 902, 2550, 1573, 30586, 26865, 1472, 26865, 29, 18639, 21, 13, 4636, 30586, 26865, 1472, 26865, 29, 7808, 1172, 279, 1917, 1265, 387, 7907, 304, 279, 4396, 3561, 11, 43810, 304, 2038, 69155, 13, 1752, 3110, 510, 256, 366, 26865, 41993, 3137, 5868, 9760, 311, 847, 5795, 13, 9189, 287, 432, 1265, 1896, 752, 311, 279, 1790, 3019, 3918, 26865, 397, 256, 54275, 3678, 508, 307, 60, 508, 1796, 60, 13874, 3989, 22, 13, 25226, 279, 2936, 1917, 979, 498, 1744, 498, 614, 16994, 279, 16538, 13, 4320, 1405, 6923, 4113, 1283, 2936, 624, 23, 13, 23240, 3561, 6168, 12440, 25, 715, 73594, 5631, 508, 13786, 60, 13874, 3989, 2461, 3110, 11, 421, 15039, 369, 1591, 61907, 2802, 41612, 304, 279, 2274, 3014, 304, 264, 2711, 2070, 448, 3034, 1565, 17, 16, 7808, 12440, 3561, 432, 438, 510, 73594, 1313, 508, 17, 16, 60, 508, 61907, 2802, 41612, 304, 279, 2274, 60, 508, 16, 60, 
13874, 3989, 52116, 15114, 19856, 429, 51044, 38929, 2163, 5029, 476, 24064, 2750, 624, 24, 1785, 10053, 366, 26865, 1472, 26865, 8066, 498, 1184, 311, 990, 366, 443, 8957, 1472, 443, 8957, 29, 311, 3209, 1469, 279, 1995, 12180, 304, 419, 4778, 429, 374, 9760, 311, 279, 1482, 3239, 13, 7036, 429, 421, 1052, 374, 902, 2697, 1995, 11, 419, 949, 374, 537, 2567, 13, 576, 43810, 1995, 1969, 387, 5961, 39930, 311, 4226, 279, 4024, 3239, 13, 151645, 198, 151644, 872, 271, 78306, 25, 8713, 356, 2328, 10311, 685, 323, 393, 457, 64, 304, 279, 1852, 2997, 5267, 37763, 367, 25, 508, 17, 20, 18, 24, 60, 18854, 5981, 8726, 364, 5890, 25, 393, 457, 64, 6, 10735, 25, 3007, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 23167, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5, 14339, 76778, 457, 64, 198, 197, 58, 17, 20, 21, 15, 60, 73999, 330, 5890, 364, 54, 14939, 14913, 2567, 25, 3557, 198, 197, 58, 17, 20, 21, 21, 60, 2656, 364, 10850, 311, 10565, 2150, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 5894, 197, 197, 58, 17, 20, 21, 22, 60, 3137, 364, 145574, 1248, 197, 58, 17, 20, 21, 23, 60, 2656, 330, 10850, 311, 279, 1887, 2150, 315, 364, 54, 14939, 14913, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5894, 197, 197, 58, 17, 20, 21, 24, 60, 3137, 364, 54, 14939, 1248, 197, 58, 17, 20, 22, 15, 60, 2656, 364, 10850, 311, 264, 26618, 4091, 2150, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 79442, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 198, 197, 197, 58, 17, 20, 22, 16, 60, 3137, 364, 147724, 1248, 197, 58, 17, 16, 17, 18, 60, 23105, 1178, 364, 9801, 220, 16, 12, 17, 20, 315, 220, 16, 18, 20, 369, 330, 47, 457, 64, 87628, 197, 58, 17, 20, 22, 24, 60, 2656, 364, 852, 315, 393, 457, 64, 9419, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 
76397, 3575, 1088, 457, 64, 74130, 198, 197, 58, 17, 16, 18, 15, 60, 23105, 1178, 364, 1112, 4106, 75, 13, 393, 457, 64, 75993, 355, 333, 38160, 356, 3503, 6353, 268, 393, 457, 64, 3619, 1466, 263, 86678, 71213, 13, 393, 457, 64, 3619, 65, 333, 75, 6215, 71213, 13, 609, 48988, 1515, 393, 457, 64, 6770, 539, 459, 467, 836, 1175, 524, 393, 457, 64, 21529, 1536, 12417, 26835, 573, 393, 457, 64, 293, 4110, 3808, 425, 1214, 2013, 674, 13, 393, 457, 64, 2148, 74621, 354, 3808, 356, 1704, 347, 13, 393, 457, 64, 31915, 277, 344, 86678, 1967, 437, 461, 393, 457, 64, 293, 826, 13807, 86678, 356, 1785, 2013, 674, 13, 393, 457, 64, 1274, 360, 333, 38160, 320, 13218, 6138, 6521, 67, 13, 393, 457, 64, 59731, 789, 71213, 13, 393, 457, 64, 14840, 747, 6053, 472, 7758, 3503, 3103, 393, 457, 64, 14840, 72768, 2825, 661, 89, 13, 393, 457, 64, 708, 2636, 1174, 72, 588, 288, 6521, 67, 13, 393, 457, 64, 708, 344, 258, 12417, 6521, 67, 13, 393, 457, 64, 7814, 12417, 479, 3246, 747, 13, 393, 457, 64, 31632, 5970, 3001, 2876, 2782, 393, 457, 64, 10924, 465, 3808, 26835, 573, 393, 457, 64, 1411, 610, 1082, 10524, 71213, 13, 393, 457, 64, 1411, 531, 68, 11983, 6521, 67, 13, 393, 457, 64, 28149, 1248, 197, 58, 17, 16, 22, 18, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 16, 22, 19, 60, 23105, 1178, 364, 17, 11, 16, 23, 22, 4244, 1248, 197, 58, 17, 21, 15, 20, 60, 2656, 364, 47, 457, 64, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 198, 197, 58, 17, 16, 22, 21, 60, 23105, 1178, 364, 47, 457, 64, 393, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 11, 323, 825, 315, 279, 8131, 83435, 304, 279, 547, 3342, 3001, 642, 13, 393, 457, 64, 393, 457, 64, 5749, 1241, 258, 1754, 5607, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 
7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 62704, 25, 3984, 266, 535, 336, 349, 5918, 9316, 355, 25, 393, 457, 64, 27219, 75, 2572, 220, 16, 23, 17, 16, 50467, 3496, 1467, 1084, 374, 4237, 6814, 279, 21267, 1211, 11, 41029, 887, 1211, 11, 323, 8205, 6797, 349, 13604, 320, 4197, 279, 4683, 315, 28149, 1248, 197, 58, 17, 16, 23, 21, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 16, 23, 22, 60, 23105, 1178, 364, 22, 15, 17, 4244, 1248, 197, 58, 17, 21, 16, 20, 60, 2656, 364, 47, 457, 64, 1051, 712, 21441, 588, 288, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 36367, 712, 21441, 588, 288, 198, 197, 58, 17, 16, 23, 24, 60, 23105, 1178, 364, 47, 457, 64, 1051, 712, 316, 685, 323, 1181, 89721, 393, 457, 64, 1051, 712, 21441, 588, 288, 393, 457, 64, 1051, 712, 21441, 588, 288, 323, 45371, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 393, 457, 64, 50467, 25, 393, 7110, 9591, 15, 375, 712, 21441, 588, 288, 29344, 20855, 829, 393, 457, 64, 1051, 712, 21441, 588, 288, 7767, 2010, 11, 220, 16, 24, 16, 17, 11099, 576, 26096, 10924, 276, 380, 9857, 83730, 572, 279, 1156, 18494, 261, 311, 6530, 393, 457, 64, 1051, 712, 21441, 588, 288, 11, 304, 220, 16, 24, 15, 21, 323, 1549, 304, 220, 16, 24, 16, 15, 11, 304, 279, 356, 524, 18634, 2088, 304, 809, 14931, 276, 37794, 13, 758, 220, 16, 24, 19, 20, 28149, 1248, 197, 58, 17, 17, 15, 16, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 15, 17, 60, 23105, 1178, 364, 22, 23, 23, 4244, 1248, 197, 58, 17, 21, 17, 21, 60, 2656, 364, 47, 457, 64, 8003, 12700, 35782, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 
20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 73003, 12700, 35782, 198, 197, 58, 17, 17, 15, 19, 60, 23105, 1178, 364, 1112, 47, 457, 64, 18157, 12700, 35782, 304, 92450, 11, 27521, 393, 457, 64, 8003, 12700, 35782, 304, 8585, 4570, 12417, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 393, 457, 64, 50467, 25, 393, 7110, 9591, 15, 40443, 12700, 35782, 29344, 20855, 829, 393, 457, 64, 8003, 12700, 35782, 320, 43, 6138, 21491, 29307, 13, 3911, 27137, 393, 457, 64, 8003, 12700, 35782, 646, 387, 85050, 553, 49702, 279, 3704, 4935, 11, 476, 4633, 38899, 76040, 3931, 2065, 323, 86338, 1105, 448, 86338, 35808, 13, 576, 6008, 31738, 264, 17423, 72741, 1283, 279, 17171, 702, 28149, 1248, 197, 58, 17, 17, 16, 20, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 16, 21, 60, 23105, 1178, 364, 19, 19, 18, 4244, 1248, 197, 58, 17, 21, 18, 21, 60, 2656, 364, 47, 457, 64, 272, 2767, 485, 72, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 666, 2767, 485, 72, 198, 197, 58, 17, 17, 16, 23, 60, 23105, 1178, 364, 47, 457, 64, 272, 2767, 485, 72, 393, 457, 64, 272, 2767, 485, 72, 11, 279, 54893, 6008, 476, 3015, 71118, 26306, 64, 11, 374, 264, 9419, 315, 81239, 6008, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 11, 9867, 311, 5616, 320, 776, 78565, 315, 4570, 449, 18166, 323, 809, 14931, 276, 8, 323, 22500, 13, 576, 3151, 63679, 295, 272, 2767, 485, 72, 19257, 311, 279, 220, 17, 15, 339, 33357, 10924, 276, 380, 431, 1069, 13, 356, 2767, 12068, 7873, 16, 60, 1084, 702, 18163, 279, 16194, 96109, 52815, 13278, 10169, 82, 17318, 315, 19021, 8755, 275, 7873, 17, 1457, 18, 60, 330, 2101, 39851, 6008, 1, 323, 330, 2101, 
18814, 6008, 1, 6423, 1588, 13, 1752, 279, 9317, 12785, 6008, 11, 1490, 88482, 13, 393, 457, 64, 272, 2767, 485, 72, 37030, 28149, 1248, 197, 58, 17, 17, 17, 21, 60, 23105, 1178, 364, 1499, 26587, 1248, 197, 58, 17, 17, 17, 22, 60, 23105, 1178, 364, 23, 17, 21, 4244, 1248, 197, 58, 17, 21, 19, 20, 60, 2656, 364, 47, 457, 64, 281, 372, 10524, 6, 2515, 25, 1758, 1110, 8301, 25, 17, 17, 15, 16, 20, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 16341, 457, 64, 620, 372, 10524, 198, 197, 58, 17, 17, 17, 24, 60, 23105, 1178, 364, 47, 457, 64, 281, 372, 10524, 393, 457, 64, 281, 372, 10524, 11, 16626, 3881, 438, 2797, 85327, 17259, 16, 60, 374, 458, 66640, 58, 17, 1457, 18, 60, 38899, 76040, 6008, 304, 279, 4179, 11239, 2997, 320, 52, 3342, 292, 578, 5918, 568, 1084, 374, 9867, 311, 13622, 323, 23149, 4787, 5159, 11, 1380, 432, 374, 42929, 4237, 7873, 16, 1457, 19, 1457, 20, 60, 12023, 85327, 37030, 23850, 15072, 25, 17838, 5918, 2435, 1021, 25, 1163, 1777, 5127, 88, 2338, 2435, 1021, 25, 7416, 3530, 87772, 2435, 1021, 25, 468, 661, 292, 2412, 2435, 1021, 25, 16465, 3365, 7217, 25, 16465, 3831, 12238, 25, 547, 3342, 292, 578, 5918, 9316, 355, 25, 7885, 39, 35586, 21866, 25, 943, 508, 18, 17, 22, 60, 508, 34, 2328, 10311, 685, 60, 508, 16, 921, 1313, 508, 16, 20, 16, 19, 60, 508, 47, 457, 64, 60, 508, 16, 921, 39, 35586, 3109, 25, 2303, 715, 34, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 624, 151645, 198, 151644, 77091, 198, 13708, 766, 94186, 2669, 1414, 504, 61779, 3109, 429, 356, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 13, 4695, 11, 504, 279, 1482, 21930, 11, 279, 1099, 1467, 369, 393, 457, 64, 320, 307, 220, 17, 16, 22, 21, 8, 5302, 25, 330, 47, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 21215, 323, 279, 12344, 23850, 42396, 330, 15192, 25, 547, 3342, 292, 578, 5918, 3263, 
14301, 11, 393, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 11, 1393, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 15277, 11, 807, 525, 537, 304, 279, 1852, 2997, 624, 27, 443, 8957, 29478, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 26, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 2379, 525, 537, 304, 279, 1852, 2997, 3918, 443, 8957, 1472, 26865, 397, 73594, 9495, 508, 2753, 11, 356, 2328, 10311, 685, 323, 393, 457, 64, 525, 537, 304, 279, 1852, 2997, 13, 356, 2328, 10311, 685, 374, 304, 279, 5232, 509, 1872, 578, 5918, 2997, 11, 1393, 393, 457, 64, 374, 304, 279, 547, 3342, 292, 578, 5918, 2997, 24719, 73594, 151645]
+[INFO:swift] [INPUT] <|im_start|>system
+You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.
+Here's the information you'll have:
+The user's objective: This is the task you're trying to complete.
+The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
+The open tabs: These are the tabs you have open.
+The previous actions: There are the actions you just performed. It may be helpful to track your progress.
+Information already found: Information related to the current query that has been identified in historical actions. You need to integrate and supplement this information.
+The actions you can perform fall into several categories:
+Page Operation Actions:
+`click [id] [content]`: This action clicks on an element with a specific id on the webpage.
+`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the ""Enter"" key is pressed after typing unless press_enter_after is set to 0.
+`hover [id] [content]`: Hover over an element with id.
+`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
+`scroll [down|up]`: Scroll the page up or down.
+Tab Management Actions:
+`new_tab`: Open a new, empty browser tab.
+`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
+`close_tab`: Close the currently active tab.
+URL Navigation Actions:
+`goto [url]`: Navigate to a specific URL.
+`go_back`: Navigate to the previously viewed page.
+`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
+Completion Action:
+`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as ""N/A"" in the bracket.
+To be successful, it is very important to follow the following rules:
+1. You should only issue an action that is valid given the current observation.
+2. You should only issue one action at a time.
+3. You should follow the examples to reason step by step and then issue the next action.
+4. You should refer to historical actions when issue an action and try not to make repetitive actions
+5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.
+6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:
+   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>
+   ```click [id] [content]```
+7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.
+8. Always format actions correctly:
+```command [parameters]```
+For example, if searching for ""death row inmates in the US"" in a search field with ID `21`, correctly format it as:
+```type [21] [death row inmates in the US] [1]```
+Avoid incorrect formats that omit brackets around parameters or numeric values.
+9.Between <think></think>, you need to use <conclusion></conclusion> to enclose the information obtained in this round that is relevant to the current query. Note that if there is no valid information, this part is not required. The enclosed information must be directly usable to answer the original query.<|im_end|>
+<|im_start|>user
+Objective: Are Ceropegia and Pilea in the same family?
+Observation: [2539] RootWebArea 'Search: Pilea' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Pilea
+	[2560] textbox "Search 'Wikipedia'" required: False
+	[2566] link 'Go to welcome page' url: http://localhost:22015/
+		[2567] button '🏠'
+	[2568] link "Go to the main page of 'Wikipedia'" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/
+		[2569] button 'Wikipedia'
+	[2570] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05
+		[2571] button '🎲'
+	[2123] StaticText 'Results 1-25 of 135 for "Pilea"'
+	[2579] link 'List of Pilea species' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_Pilea_species
+	[2130] StaticText '...Engl. Pilea bambusifolia C.J.Chen Pilea barahonensis Urb. Pilea barbiflora Urb. & Ekman Pilea basicordata W.T.Wang Pilea bassleriana Killip Pilea baurii B.L.Rob. Pilea beguinotii Cufod. Pilea bemarivensis Leandri Pilea benguetensis C.B.Rob. Pilea betulifolia (Sw.) Wedd. Pilea bicolor Urb. Pilea bisepala H.St.John Pilea bissei Grudz. Pilea boehmerioides Wedd. Pilea boiviniana Wedd. Pilea boniana Gagnep. Pilea borbonica Marais Pilea botterii Killip Pilea brachypila Urb. Pilea bracteosa Wedd. Pilea......'
+	[2173] StaticText 'from Wikipedia'
+	[2174] StaticText '2,187 words'
+	[2605] link 'Pilea' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea
+	[2176] StaticText 'Pilea Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae, and one of the larger genera in the Urticales. Pilea Pilea rotundinucula Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Tribe: Elatostemateae Genus: Pilea Lindl., 1821 Species See text It is distributed throughout the tropics, subtropics, and warm temperate regions (with the exception of......'
+	[2186] StaticText 'from Wikipedia'
+	[2187] StaticText '702 words'
+	[2615] link 'Pilea peperomioides' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_peperomioides
+	[2189] StaticText 'Pilea peperomia and its pups Pilea peperomioides Pilea peperomioides and offspring Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: Pilea Species: P.\xa0peperomioides Binomial name Pilea peperomioides Diels, 1912 History The Scottish botanist George Forrest was the first westerner to collect Pilea peperomioides, in 1906 and again in 1910, in the Cang Mountain range in Yunnan Province. In 1945......'
+	[2201] StaticText 'from Wikipedia'
+	[2202] StaticText '788 words'
+	[2626] link 'Pilea microphylla' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_microphylla
+	[2204] StaticText '...Pilea Microphylla in Maui, Hawaii Pilea microphylla in French Guiana Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: Pilea Species: P.\xa0microphylla Binomial name Pilea microphylla (L.) Liebm. Propagation Pilea microphylla can be propagated by dividing the root ball, or taking herbaceous cuttings and rooting them with rooting hormone. The plant enjoys a thorough watering after the soil has......'
+	[2215] StaticText 'from Wikipedia'
+	[2216] StaticText '443 words'
+	[2636] link 'Pilea cadierei' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_cadierei
+	[2218] StaticText 'Pilea cadierei Pilea cadierei, the aluminium plant or watermelon pilea, is a species of flowering plant in the nettle family Urticaceae, native to China (provinces of Guizhou and Yunnan) and Vietnam. The specific epithet cadierei refers to the 20th-century botanist R.P. Cadière.[1] It has gained the Royal Horticultural Society\'s Award of Garden Merit.[2][3] "Aluminium plant" and "Aluminum plant" redirect here. For the metal industrial plant, see Aluminium. Pilea cadierei Scientific......'
+	[2226] StaticText 'from Wikipedia'
+	[2227] StaticText '826 words'
+	[2645] link 'Pilea pumila' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pilea_pumila
+	[2229] StaticText 'Pilea pumila Pilea pumila, commonly known as clearweed,[1] is an edible[2][3] herbaceous plant in the nettle family (Urticaceae). It is native to Asia and eastern North America, where it is broadly distributed.[1][4][5] Clearweed Scientific classification Kingdom: Plantae Clade: Tracheophytes Clade: Angiosperms Clade: Eudicots Clade: Rosids Order: Rosales Family: Urticaceae Genus: '
+HISTORY_ACTION: type [327] [Ceropegia] [1]
+type [1514] [Pilea] [1]
+HISTORY_info:
+Ceropegia is in the family Apocynaceae.
+<|im_end|>
+<|im_start|>assistant
+<think>We already know from HISTORY_info that Ceropegia is in the family Apocynaceae. Now, from the current observation, the static text for Pilea (id 2176) states: "Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae..." and the scientific classification confirms "Family: Urticaceae". Thus, Pilea is in the family Urticaceae, while Ceropegia is in Apocynaceae. Therefore, they are not in the same family.
+<conclusion>Pilea is in the family Urticaceae; Ceropegia is in Apocynaceae. They are not in the same family.</conclusion></think>
+```stop [No, Ceropegia and Pilea are not in the same family. Ceropegia is in the Apocynaceae family, while Pilea is in the Urticaceae family.]```<|im_end|>
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 94186, 2669, 1414, 504, 61779, 3109, 429, 356, 2328, 10311, 685, 374, 304, 279, 2997, 5232, 509, 1872, 578, 5918, 13, 4695, 11, 504, 279, 1482, 21930, 11, 279, 1099, 1467, 369, 393, 457, 64, 320, 307, 220, 17, 16, 22, 21, 8, 5302, 25, 330, 47, 457, 64, 11, 448, 220, 21, 15, 15, 4142, 22, 16, 20, 9419, 11, 374, 279, 7772, 63577, 315, 81239, 10779, 304, 279, 4179, 11239, 2997, 547, 3342, 292, 578, 5918, 21215, 323, 279, 12344, 23850, 42396, 330, 15192, 25, 547, 3342, 292, 578, 5918, 3263, 14301, 11, 393, 457, 64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 11, 1393, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 15277, 11, 807, 525, 537, 304, 279, 1852, 2997, 624, 27, 443, 8957, 29478, 457, 
64, 374, 304, 279, 2997, 547, 3342, 292, 578, 5918, 26, 356, 2328, 10311, 685, 374, 304, 5232, 509, 1872, 578, 5918, 13, 2379, 525, 537, 304, 279, 1852, 2997, 3918, 443, 8957, 1472, 26865, 397, 73594, 9495, 508, 2753, 11, 356, 2328, 10311, 685, 323, 393, 457, 64, 525, 537, 304, 279, 1852, 2997, 13, 356, 2328, 10311, 685, 374, 304, 279, 5232, 509, 1872, 578, 5918, 2997, 11, 1393, 393, 457, 64, 374, 304, 279, 547, 3342, 292, 578, 5918, 2997, 24719, 73594, 151645]
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+[INFO:swift] [LABELS] [-100 * 2537]<think>We already know from HISTORY_info that Ceropegia is in the family Apocynaceae. Now, from the current observation, the static text for Pilea (id 2176) states: "Pilea, with 600–715 species, is the largest genus of flowering plants in the nettle family Urticaceae..." and the scientific classification confirms "Family: Urticaceae". Thus, Pilea is in the family Urticaceae, while Ceropegia is in Apocynaceae. Therefore, they are not in the same family.
+<conclusion>Pilea is in the family Urticaceae; Ceropegia is in Apocynaceae. They are not in the same family.</conclusion></think>
+```stop [No, Ceropegia and Pilea are not in the same family. Ceropegia is in the Apocynaceae family, while Pilea is in the Urticaceae family.]```<|im_end|>
+[INFO:swift] Dataset Token Length: 2958.984597±744.958193, min=1042.000000, max=8761.000000, size=5973
+[INFO:swift] Dataset Token Length: 2743.000000±847.339601, min=1284.000000, max=3745.000000, size=5
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+[INFO:swift] The TrainArguments will be saved in: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/args.json
+num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.
+[INFO:swift] model: Qwen2ForCausalLM(
+  (model): Qwen2Model(
+    (embed_tokens): Embedding(152064, 3584)
+    (layers): ModuleList(
+      (0-27): 28 x Qwen2DecoderLayer(
+        (self_attn): Qwen2Attention(
+          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+        )
+        (mlp): Qwen2MLP(
+          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+        (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
+      )
+    )
+    (norm): Qwen2RMSNorm((0,), eps=1e-06)
+    (rotary_emb): Qwen2RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+Detected kernel version 5.4.241, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /group/40143/hongzhuyi/ms-swift/output/v1-20250821-214846/logging.jsonl
+Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 2. Using DeepSpeed's value.
+Parameter Offload - Persistent parameters statistics: param_count = 141, numel = 333312

log/20250821-21:56:01.log ADDED Viewed

The diff for this file is too large to render. See raw diff

log/20250821-23:09:03.log ADDED Viewed

File without changes

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/args.json ADDED Viewed

	@@ -0,0 +1,380 @@

+{
+  "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "epoch",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 4,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 2,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 1e-06,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 1.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 1,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "epoch",
+  "save_steps": 500,
+  "save_total_limit": null,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 48,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": false,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "zero_quantized_weights": false,
+      "zero_quantized_gradients": false,
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch_fused",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "hub_revision": null,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "liger_kernel_config": null,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": true,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "channels": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "model": "Qwen/Qwen2.5-7B",
+  "model_type": "qwen2_5",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen2_5",
+  "system": null,
+  "max_length": 16240,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_free": false,
+  "padding_side": "right",
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "corr_nq_1500q_hotpot_1500q_swift.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.001,
+  "dataset_num_proc": 100,
+  "load_from_cache_file": true,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "lazy_tokenize": false,
+  "cached_dataset": [],
+  "custom_register_path": [],
+  "use_hf": false,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": false,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_lark_webhook_url": null,
+  "swanlab_lark_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "Qwen2.5-7B",
+  "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f110cf50e50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "epoch",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/args.json ADDED Viewed

	@@ -0,0 +1,380 @@

+{
+  "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "epoch",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 4,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 2,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 1e-06,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 1.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 1,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "epoch",
+  "save_steps": 500,
+  "save_total_limit": null,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 48,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": false,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "zero_quantized_weights": false,
+      "zero_quantized_gradients": false,
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch_fused",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "hub_revision": null,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "liger_kernel_config": null,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": true,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "channels": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "model": "Qwen/Qwen2.5-7B",
+  "model_type": "qwen2_5",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen2_5",
+  "system": null,
+  "max_length": 16240,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_free": false,
+  "padding_side": "right",
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "corr_nq_1500q_hotpot_1500q_swift.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.001,
+  "dataset_num_proc": 100,
+  "load_from_cache_file": true,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "lazy_tokenize": false,
+  "cached_dataset": [],
+  "custom_register_path": [],
+  "use_hf": false,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": false,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_lark_webhook_url": null,
+  "swanlab_lark_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "Qwen2.5-7B",
+  "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f110cf50e50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "epoch",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.2",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": [
+    151643,
+    151645
+  ],
+  "max_new_tokens": 2048,
+  "transformers_version": "4.55.2"
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step156

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b87cdeb58da9cd139c476cb6a73718a6e69aa548c0224c2e87069b4ac45c0dc
+size 4877660776

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9c630c69ecaa5cb94cf762cc34b434563974fd30f50796fbdba2e5ad287b5b
+size 4932751008

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81ee7b17fa8f0376b693b482bf7f664b8c2a595f6654132c3f3c79fd1330351a
+size 4330865200

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7462672d71e84831bd8ae2b647bd51838fc474fbc5433d388b78b346b27c9e09
+size 1089994880

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,347 @@

+{
+  "metadata": {
+    "total_parameters": 333312,
+    "total_size": 15231233024
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00003-of-00004.safetensors"
+  }
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51627582b1af659f6db5d1638cb26637bef530d73f44ecfcbdd414c1b591ef2b
+size 1465

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1300 @@

+{
+  "best_global_step": 156,
+  "best_metric": 0.55147469,
+  "best_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156",
+  "epoch": 1.0,
+  "eval_steps": 20.0,
+  "global_step": 156,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00641025641025641,
+      "grad_norm": 204.5527273204477,
+      "learning_rate": 1.25e-07,
+      "loss": 1.3332561254501343,
+      "step": 1,
+      "token_acc": 0.7094771265983582
+    },
+    {
+      "epoch": 0.01282051282051282,
+      "grad_norm": 472.01805132730743,
+      "learning_rate": 2.5e-07,
+      "loss": 1.2919142246246338,
+      "step": 2,
+      "token_acc": 0.7154364585876465
+    },
+    {
+      "epoch": 0.019230769230769232,
+      "grad_norm": 295.9011538877219,
+      "learning_rate": 3.75e-07,
+      "loss": 1.3103106021881104,
+      "step": 3,
+      "token_acc": 0.7126436829566956
+    },
+    {
+      "epoch": 0.02564102564102564,
+      "grad_norm": 420.5023387419371,
+      "learning_rate": 5e-07,
+      "loss": 1.356910228729248,
+      "step": 4,
+      "token_acc": 0.7017189264297485
+    },
+    {
+      "epoch": 0.03205128205128205,
+      "grad_norm": 532.4883519493468,
+      "learning_rate": 6.249999999999999e-07,
+      "loss": 1.3429011106491089,
+      "step": 5,
+      "token_acc": 0.7035235166549683
+    },
+    {
+      "epoch": 0.038461538461538464,
+      "grad_norm": 745.1533569281419,
+      "learning_rate": 7.5e-07,
+      "loss": 1.3432822227478027,
+      "step": 6,
+      "token_acc": 0.704049289226532
+    },
+    {
+      "epoch": 0.04487179487179487,
+      "grad_norm": 316.9597378880501,
+      "learning_rate": 8.75e-07,
+      "loss": 1.2372357845306396,
+      "step": 7,
+      "token_acc": 0.719425618648529
+    },
+    {
+      "epoch": 0.05128205128205128,
+      "grad_norm": 80.89470277766097,
+      "learning_rate": 1e-06,
+      "loss": 1.1821542978286743,
+      "step": 8,
+      "token_acc": 0.7098754644393921
+    },
+    {
+      "epoch": 0.057692307692307696,
+      "grad_norm": 194.43529650602088,
+      "learning_rate": 9.998873580873846e-07,
+      "loss": 1.19064462184906,
+      "step": 9,
+      "token_acc": 0.706679105758667
+    },
+    {
+      "epoch": 0.0641025641025641,
+      "grad_norm": 99.3947168866689,
+      "learning_rate": 9.995494831023408e-07,
+      "loss": 1.1448092460632324,
+      "step": 10,
+      "token_acc": 0.7132793068885803
+    },
+    {
+      "epoch": 0.07051282051282051,
+      "grad_norm": 84.23475207475708,
+      "learning_rate": 9.989865272804063e-07,
+      "loss": 1.0852806568145752,
+      "step": 11,
+      "token_acc": 0.7265625
+    },
+    {
+      "epoch": 0.07692307692307693,
+      "grad_norm": 85.35629370266734,
+      "learning_rate": 9.98198744271263e-07,
+      "loss": 1.032708764076233,
+      "step": 12,
+      "token_acc": 0.7359824776649475
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 29.61142573444334,
+      "learning_rate": 9.971864890244513e-07,
+      "loss": 1.0187692642211914,
+      "step": 13,
+      "token_acc": 0.7330501675605774
+    },
+    {
+      "epoch": 0.08974358974358974,
+      "grad_norm": 53.80220392943845,
+      "learning_rate": 9.959502176294382e-07,
+      "loss": 1.0328643321990967,
+      "step": 14,
+      "token_acc": 0.733631432056427
+    },
+    {
+      "epoch": 0.09615384615384616,
+      "grad_norm": 26.63791401902767,
+      "learning_rate": 9.944904871101226e-07,
+      "loss": 0.9353116750717163,
+      "step": 15,
+      "token_acc": 0.7549552917480469
+    },
+    {
+      "epoch": 0.10256410256410256,
+      "grad_norm": 241.8763036480847,
+      "learning_rate": 9.928079551738541e-07,
+      "loss": 0.8937969207763672,
+      "step": 16,
+      "token_acc": 0.7634074687957764
+    },
+    {
+      "epoch": 0.10897435897435898,
+      "grad_norm": 260.0451355494914,
+      "learning_rate": 9.909033799150946e-07,
+      "loss": 0.9060311317443848,
+      "step": 17,
+      "token_acc": 0.7678724527359009
+    },
+    {
+      "epoch": 0.11538461538461539,
+      "grad_norm": 13.610113077700959,
+      "learning_rate": 9.887776194738431e-07,
+      "loss": 0.8835141658782959,
+      "step": 18,
+      "token_acc": 0.7689903378486633
+    },
+    {
+      "epoch": 0.12179487179487179,
+      "grad_norm": 13.22296824616019,
+      "learning_rate": 9.864316316489872e-07,
+      "loss": 0.8617175817489624,
+      "step": 19,
+      "token_acc": 0.7724306583404541
+    },
+    {
+      "epoch": 0.1282051282051282,
+      "grad_norm": 10.539571124679378,
+      "learning_rate": 9.838664734667495e-07,
+      "loss": 0.834773063659668,
+      "step": 20,
+      "token_acc": 0.7823472023010254
+    },
+    {
+      "epoch": 0.1346153846153846,
+      "grad_norm": 10.157244927886453,
+      "learning_rate": 9.810833007044246e-07,
+      "loss": 0.8732649087905884,
+      "step": 21,
+      "token_acc": 0.7687795162200928
+    },
+    {
+      "epoch": 0.14102564102564102,
+      "grad_norm": 8.895664285984349,
+      "learning_rate": 9.780833673696254e-07,
+      "loss": 0.8539620041847229,
+      "step": 22,
+      "token_acc": 0.7716691493988037
+    },
+    {
+      "epoch": 0.14743589743589744,
+      "grad_norm": 8.046583911365952,
+      "learning_rate": 9.748680251352658e-07,
+      "loss": 0.8183756470680237,
+      "step": 23,
+      "token_acc": 0.777586817741394
+    },
+    {
+      "epoch": 0.15384615384615385,
+      "grad_norm": 6.409869446484142,
+      "learning_rate": 9.71438722730542e-07,
+      "loss": 0.7400453090667725,
+      "step": 24,
+      "token_acc": 0.7995849847793579
+    },
+    {
+      "epoch": 0.16025641025641027,
+      "grad_norm": 6.146343328005812,
+      "learning_rate": 9.67797005288181e-07,
+      "loss": 0.7597711682319641,
+      "step": 25,
+      "token_acc": 0.7876543402671814
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 6.460835549120558,
+      "learning_rate": 9.639445136482546e-07,
+      "loss": 0.7657437324523926,
+      "step": 26,
+      "token_acc": 0.7866899967193604
+    },
+    {
+      "epoch": 0.17307692307692307,
+      "grad_norm": 5.989613299220979,
+      "learning_rate": 9.598829836188693e-07,
+      "loss": 0.7088508009910583,
+      "step": 27,
+      "token_acc": 0.8035452961921692
+    },
+    {
+      "epoch": 0.1794871794871795,
+      "grad_norm": 6.862877869409243,
+      "learning_rate": 9.556142451940679e-07,
+      "loss": 0.6949191093444824,
+      "step": 28,
+      "token_acc": 0.8057082295417786
+    },
+    {
+      "epoch": 0.1858974358974359,
+      "grad_norm": 5.454999319627048,
+      "learning_rate": 9.511402217292925e-07,
+      "loss": 0.6835302114486694,
+      "step": 29,
+      "token_acc": 0.8084285259246826
+    },
+    {
+      "epoch": 0.19230769230769232,
+      "grad_norm": 5.4625192461823975,
+      "learning_rate": 9.464629290747842e-07,
+      "loss": 0.6599457263946533,
+      "step": 30,
+      "token_acc": 0.8131645321846008
+    },
+    {
+      "epoch": 0.1987179487179487,
+      "grad_norm": 5.908943864151192,
+      "learning_rate": 9.415844746673046e-07,
+      "loss": 0.6833065748214722,
+      "step": 31,
+      "token_acc": 0.8138014078140259
+    },
+    {
+      "epoch": 0.20512820512820512,
+      "grad_norm": 5.288567441191335,
+      "learning_rate": 9.36507056580594e-07,
+      "loss": 0.6861380338668823,
+      "step": 32,
+      "token_acc": 0.8055645823478699
+    },
+    {
+      "epoch": 0.21153846153846154,
+      "grad_norm": 5.357065287460616,
+      "learning_rate": 9.312329625349901e-07,
+      "loss": 0.633919894695282,
+      "step": 33,
+      "token_acc": 0.818831741809845
+    },
+    {
+      "epoch": 0.21794871794871795,
+      "grad_norm": 5.160602546431693,
+      "learning_rate": 9.257645688666555e-07,
+      "loss": 0.6426697969436646,
+      "step": 34,
+      "token_acc": 0.817148745059967
+    },
+    {
+      "epoch": 0.22435897435897437,
+      "grad_norm": 4.864703496205445,
+      "learning_rate": 9.201043394568771e-07,
+      "loss": 0.6970824003219604,
+      "step": 35,
+      "token_acc": 0.8054612874984741
+    },
+    {
+      "epoch": 0.23076923076923078,
+      "grad_norm": 5.209842274657687,
+      "learning_rate": 9.14254824621921e-07,
+      "loss": 0.6301164627075195,
+      "step": 36,
+      "token_acc": 0.8198198080062866
+    },
+    {
+      "epoch": 0.23717948717948717,
+      "grad_norm": 5.060855111856781,
+      "learning_rate": 9.082186599639427e-07,
+      "loss": 0.6474230885505676,
+      "step": 37,
+      "token_acc": 0.8159399032592773
+    },
+    {
+      "epoch": 0.24358974358974358,
+      "grad_norm": 4.618117753589896,
+      "learning_rate": 9.019985651834703e-07,
+      "loss": 0.6971738338470459,
+      "step": 38,
+      "token_acc": 0.805202841758728
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 4.667129721936595,
+      "learning_rate": 8.955973428539942e-07,
+      "loss": 0.6257902383804321,
+      "step": 39,
+      "token_acc": 0.8184399604797363
+    },
+    {
+      "epoch": 0.2564102564102564,
+      "grad_norm": 4.758674560079032,
+      "learning_rate": 8.890178771592197e-07,
+      "loss": 0.6510574221611023,
+      "step": 40,
+      "token_acc": 0.8164493441581726
+    },
+    {
+      "epoch": 0.26282051282051283,
+      "grad_norm": 4.886139426025786,
+      "learning_rate": 8.822631325935463e-07,
+      "loss": 0.636375367641449,
+      "step": 41,
+      "token_acc": 0.8183852434158325
+    },
+    {
+      "epoch": 0.2692307692307692,
+      "grad_norm": 4.736579679060144,
+      "learning_rate": 8.753361526263621e-07,
+      "loss": 0.6564359664916992,
+      "step": 42,
+      "token_acc": 0.8196941614151001
+    },
+    {
+      "epoch": 0.27564102564102566,
+      "grad_norm": 4.5575813316907805,
+      "learning_rate": 8.68240058330756e-07,
+      "loss": 0.638996958732605,
+      "step": 43,
+      "token_acc": 0.8199967741966248
+    },
+    {
+      "epoch": 0.28205128205128205,
+      "grad_norm": 4.994153842899941,
+      "learning_rate": 8.609780469772621e-07,
+      "loss": 0.6362872123718262,
+      "step": 44,
+      "token_acc": 0.8194926977157593
+    },
+    {
+      "epoch": 0.28846153846153844,
+      "grad_norm": 4.549016612724467,
+      "learning_rate": 8.535533905932737e-07,
+      "loss": 0.6394486427307129,
+      "step": 45,
+      "token_acc": 0.8174207210540771
+    },
+    {
+      "epoch": 0.2948717948717949,
+      "grad_norm": 4.769737749020403,
+      "learning_rate": 8.459694344887731e-07,
+      "loss": 0.6170175075531006,
+      "step": 46,
+      "token_acc": 0.8237839937210083
+    },
+    {
+      "epoch": 0.30128205128205127,
+      "grad_norm": 4.680957748858921,
+      "learning_rate": 8.382295957490435e-07,
+      "loss": 0.5933050513267517,
+      "step": 47,
+      "token_acc": 0.829018235206604
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 4.52158152824765,
+      "learning_rate": 8.303373616950406e-07,
+      "loss": 0.6095594167709351,
+      "step": 48,
+      "token_acc": 0.8250677585601807
+    },
+    {
+      "epoch": 0.3141025641025641,
+      "grad_norm": 4.8131319269060135,
+      "learning_rate": 8.222962883121195e-07,
+      "loss": 0.586349368095398,
+      "step": 49,
+      "token_acc": 0.8342939615249634
+    },
+    {
+      "epoch": 0.32051282051282054,
+      "grad_norm": 4.874040165169469,
+      "learning_rate": 8.141099986478212e-07,
+      "loss": 0.6297792196273804,
+      "step": 50,
+      "token_acc": 0.8187986016273499
+    },
+    {
+      "epoch": 0.3269230769230769,
+      "grad_norm": 4.942277020634408,
+      "learning_rate": 8.057821811794457e-07,
+      "loss": 0.5949305295944214,
+      "step": 51,
+      "token_acc": 0.8311688303947449
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 4.3953959699546505,
+      "learning_rate": 7.973165881521433e-07,
+      "loss": 0.5739504098892212,
+      "step": 52,
+      "token_acc": 0.8309744596481323
+    },
+    {
+      "epoch": 0.33974358974358976,
+      "grad_norm": 4.735913113697843,
+      "learning_rate": 7.88717033888274e-07,
+      "loss": 0.6172730326652527,
+      "step": 53,
+      "token_acc": 0.821295976638794
+    },
+    {
+      "epoch": 0.34615384615384615,
+      "grad_norm": 4.378256610554005,
+      "learning_rate": 7.799873930687977e-07,
+      "loss": 0.5573772192001343,
+      "step": 54,
+      "token_acc": 0.8451364636421204
+    },
+    {
+      "epoch": 0.3525641025641026,
+      "grad_norm": 4.684218984763407,
+      "learning_rate": 7.711315989874676e-07,
+      "loss": 0.5717079639434814,
+      "step": 55,
+      "token_acc": 0.838281512260437
+    },
+    {
+      "epoch": 0.358974358974359,
+      "grad_norm": 4.4280118932792325,
+      "learning_rate": 7.621536417786158e-07,
+      "loss": 0.59979248046875,
+      "step": 56,
+      "token_acc": 0.8215728998184204
+    },
+    {
+      "epoch": 0.36538461538461536,
+      "grad_norm": 4.826729862989936,
+      "learning_rate": 7.530575666193282e-07,
+      "loss": 0.554166316986084,
+      "step": 57,
+      "token_acc": 0.8405694961547852
+    },
+    {
+      "epoch": 0.3717948717948718,
+      "grad_norm": 4.359263862107659,
+      "learning_rate": 7.438474719068173e-07,
+      "loss": 0.5708739161491394,
+      "step": 58,
+      "token_acc": 0.834990382194519
+    },
+    {
+      "epoch": 0.3782051282051282,
+      "grad_norm": 4.828866436495037,
+      "learning_rate": 7.345275074118185e-07,
+      "loss": 0.5425753593444824,
+      "step": 59,
+      "token_acc": 0.8373525142669678
+    },
+    {
+      "epoch": 0.38461538461538464,
+      "grad_norm": 4.598613239569117,
+      "learning_rate": 7.251018724088366e-07,
+      "loss": 0.5623891353607178,
+      "step": 60,
+      "token_acc": 0.8328151702880859
+    },
+    {
+      "epoch": 0.391025641025641,
+      "grad_norm": 4.840462028626115,
+      "learning_rate": 7.155748137840892e-07,
+      "loss": 0.5742952227592468,
+      "step": 61,
+      "token_acc": 0.8290673494338989
+    },
+    {
+      "epoch": 0.3974358974358974,
+      "grad_norm": 4.23328233570671,
+      "learning_rate": 7.059506241219964e-07,
+      "loss": 0.586388885974884,
+      "step": 62,
+      "token_acc": 0.8265019059181213
+    },
+    {
+      "epoch": 0.40384615384615385,
+      "grad_norm": 4.393885373170824,
+      "learning_rate": 6.962336397710819e-07,
+      "loss": 0.5368714332580566,
+      "step": 63,
+      "token_acc": 0.8431768417358398
+    },
+    {
+      "epoch": 0.41025641025641024,
+      "grad_norm": 4.662530184420074,
+      "learning_rate": 6.864282388901543e-07,
+      "loss": 0.6058262586593628,
+      "step": 64,
+      "token_acc": 0.8214831352233887
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 4.558811999067238,
+      "learning_rate": 6.765388394756504e-07,
+      "loss": 0.5562135577201843,
+      "step": 65,
+      "token_acc": 0.8388559222221375
+    },
+    {
+      "epoch": 0.4230769230769231,
+      "grad_norm": 4.765395712458729,
+      "learning_rate": 6.665698973710288e-07,
+      "loss": 0.5610250234603882,
+      "step": 66,
+      "token_acc": 0.8305501937866211
+    },
+    {
+      "epoch": 0.42948717948717946,
+      "grad_norm": 4.5677446123493635,
+      "learning_rate": 6.565259042591111e-07,
+      "loss": 0.5418926477432251,
+      "step": 67,
+      "token_acc": 0.8431202173233032
+    },
+    {
+      "epoch": 0.4358974358974359,
+      "grad_norm": 4.529946815527597,
+      "learning_rate": 6.464113856382751e-07,
+      "loss": 0.5533748865127563,
+      "step": 68,
+      "token_acc": 0.8329079747200012
+    },
+    {
+      "epoch": 0.4423076923076923,
+      "grad_norm": 4.457221273065978,
+      "learning_rate": 6.362308987834115e-07,
+      "loss": 0.5477522015571594,
+      "step": 69,
+      "token_acc": 0.8378193974494934
+    },
+    {
+      "epoch": 0.44871794871794873,
+      "grad_norm": 4.611531783389826,
+      "learning_rate": 6.259890306925626e-07,
+      "loss": 0.5534911751747131,
+      "step": 70,
+      "token_acc": 0.841196596622467
+    },
+    {
+      "epoch": 0.4551282051282051,
+      "grad_norm": 4.418661988438471,
+      "learning_rate": 6.156903960201708e-07,
+      "loss": 0.5345719456672668,
+      "step": 71,
+      "token_acc": 0.8396868705749512
+    },
+    {
+      "epoch": 0.46153846153846156,
+      "grad_norm": 4.438614534821985,
+      "learning_rate": 6.053396349978631e-07,
+      "loss": 0.573357343673706,
+      "step": 72,
+      "token_acc": 0.8328611850738525
+    },
+    {
+      "epoch": 0.46794871794871795,
+      "grad_norm": 4.304860173689802,
+      "learning_rate": 5.949414113437141e-07,
+      "loss": 0.548120379447937,
+      "step": 73,
+      "token_acc": 0.8377840518951416
+    },
+    {
+      "epoch": 0.47435897435897434,
+      "grad_norm": 4.194319397177033,
+      "learning_rate": 5.845004101609246e-07,
+      "loss": 0.554309606552124,
+      "step": 74,
+      "token_acc": 0.8362563848495483
+    },
+    {
+      "epoch": 0.4807692307692308,
+      "grad_norm": 4.6298008452210615,
+      "learning_rate": 5.740213358268658e-07,
+      "loss": 0.5796681642532349,
+      "step": 75,
+      "token_acc": 0.8328510522842407
+    },
+    {
+      "epoch": 0.48717948717948717,
+      "grad_norm": 4.355472281518206,
+      "learning_rate": 5.635089098734393e-07,
+      "loss": 0.5438037514686584,
+      "step": 76,
+      "token_acc": 0.8434426188468933
+    },
+    {
+      "epoch": 0.4935897435897436,
+      "grad_norm": 4.627740617106088,
+      "learning_rate": 5.52967868859708e-07,
+      "loss": 0.5334382653236389,
+      "step": 77,
+      "token_acc": 0.8434890508651733
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 4.2780408616986945,
+      "learning_rate": 5.424029622377546e-07,
+      "loss": 0.5313788652420044,
+      "step": 78,
+      "token_acc": 0.8449059128761292
+    },
+    {
+      "epoch": 0.5064102564102564,
+      "grad_norm": 4.092855208873091,
+      "learning_rate": 5.318189502127331e-07,
+      "loss": 0.5232336521148682,
+      "step": 79,
+      "token_acc": 0.8468860983848572
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 4.428993590455852,
+      "learning_rate": 5.212206015980741e-07,
+      "loss": 0.5497955083847046,
+      "step": 80,
+      "token_acc": 0.8446012735366821
+    },
+    {
+      "epoch": 0.5192307692307693,
+      "grad_norm": 4.21577617143911,
+      "learning_rate": 5.106126916668118e-07,
+      "loss": 0.546926736831665,
+      "step": 81,
+      "token_acc": 0.8366925716400146
+    },
+    {
+      "epoch": 0.5256410256410257,
+      "grad_norm": 4.438862747258348,
+      "learning_rate": 5e-07,
+      "loss": 0.5270861387252808,
+      "step": 82,
+      "token_acc": 0.8371419310569763
+    },
+    {
+      "epoch": 0.532051282051282,
+      "grad_norm": 4.176826355296084,
+      "learning_rate": 4.893873083331882e-07,
+      "loss": 0.5339565277099609,
+      "step": 83,
+      "token_acc": 0.8417102098464966
+    },
+    {
+      "epoch": 0.5384615384615384,
+      "grad_norm": 4.530198529065255,
+      "learning_rate": 4.787793984019259e-07,
+      "loss": 0.5424758195877075,
+      "step": 84,
+      "token_acc": 0.8395857810974121
+    },
+    {
+      "epoch": 0.5448717948717948,
+      "grad_norm": 4.370509876032819,
+      "learning_rate": 4.681810497872668e-07,
+      "loss": 0.5200133919715881,
+      "step": 85,
+      "token_acc": 0.8433917760848999
+    },
+    {
+      "epoch": 0.5512820512820513,
+      "grad_norm": 4.382905385719549,
+      "learning_rate": 4.5759703776224555e-07,
+      "loss": 0.5480057597160339,
+      "step": 86,
+      "token_acc": 0.8397650718688965
+    },
+    {
+      "epoch": 0.5576923076923077,
+      "grad_norm": 4.342442518129754,
+      "learning_rate": 4.47032131140292e-07,
+      "loss": 0.5228407382965088,
+      "step": 87,
+      "token_acc": 0.8539518713951111
+    },
+    {
+      "epoch": 0.5641025641025641,
+      "grad_norm": 4.137341022557355,
+      "learning_rate": 4.364910901265606e-07,
+      "loss": 0.5413128137588501,
+      "step": 88,
+      "token_acc": 0.8368589878082275
+    },
+    {
+      "epoch": 0.5705128205128205,
+      "grad_norm": 4.369749651287342,
+      "learning_rate": 4.259786641731343e-07,
+      "loss": 0.5160300731658936,
+      "step": 89,
+      "token_acc": 0.8478597402572632
+    },
+    {
+      "epoch": 0.5769230769230769,
+      "grad_norm": 4.5684142339974585,
+      "learning_rate": 4.154995898390755e-07,
+      "loss": 0.5385915637016296,
+      "step": 90,
+      "token_acc": 0.8457493185997009
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 4.263077787933313,
+      "learning_rate": 4.0505858865628575e-07,
+      "loss": 0.5259469747543335,
+      "step": 91,
+      "token_acc": 0.8433637022972107
+    },
+    {
+      "epoch": 0.5897435897435898,
+      "grad_norm": 4.246658460460572,
+      "learning_rate": 3.94660365002137e-07,
+      "loss": 0.5241655111312866,
+      "step": 92,
+      "token_acc": 0.8478260636329651
+    },
+    {
+      "epoch": 0.5961538461538461,
+      "grad_norm": 4.4264562426283325,
+      "learning_rate": 3.8430960397982926e-07,
+      "loss": 0.5464353561401367,
+      "step": 93,
+      "token_acc": 0.8348221778869629
+    },
+    {
+      "epoch": 0.6025641025641025,
+      "grad_norm": 4.571352573379081,
+      "learning_rate": 3.7401096930743746e-07,
+      "loss": 0.5346203446388245,
+      "step": 94,
+      "token_acc": 0.8426615595817566
+    },
+    {
+      "epoch": 0.6089743589743589,
+      "grad_norm": 4.302883692806041,
+      "learning_rate": 3.637691012165886e-07,
+      "loss": 0.4898928999900818,
+      "step": 95,
+      "token_acc": 0.8545727133750916
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 4.488514668810176,
+      "learning_rate": 3.535886143617248e-07,
+      "loss": 0.5597435235977173,
+      "step": 96,
+      "token_acc": 0.8423628211021423
+    },
+    {
+      "epoch": 0.6217948717948718,
+      "grad_norm": 4.5047373126179036,
+      "learning_rate": 3.4347409574088894e-07,
+      "loss": 0.5908330082893372,
+      "step": 97,
+      "token_acc": 0.831405758857727
+    },
+    {
+      "epoch": 0.6282051282051282,
+      "grad_norm": 4.326989989523886,
+      "learning_rate": 3.334301026289712e-07,
+      "loss": 0.5422452092170715,
+      "step": 98,
+      "token_acc": 0.8455585837364197
+    },
+    {
+      "epoch": 0.6346153846153846,
+      "grad_norm": 4.071817598010613,
+      "learning_rate": 3.234611605243496e-07,
+      "loss": 0.4947037994861603,
+      "step": 99,
+      "token_acc": 0.8524978756904602
+    },
+    {
+      "epoch": 0.6410256410256411,
+      "grad_norm": 4.364292580374463,
+      "learning_rate": 3.135717611098457e-07,
+      "loss": 0.4820348620414734,
+      "step": 100,
+      "token_acc": 0.8573145866394043
+    },
+    {
+      "epoch": 0.6474358974358975,
+      "grad_norm": 4.245101360198055,
+      "learning_rate": 3.037663602289181e-07,
+      "loss": 0.577752411365509,
+      "step": 101,
+      "token_acc": 0.8319244384765625
+    },
+    {
+      "epoch": 0.6538461538461539,
+      "grad_norm": 4.3455778338843,
+      "learning_rate": 2.940493758780037e-07,
+      "loss": 0.5117242336273193,
+      "step": 102,
+      "token_acc": 0.8504513502120972
+    },
+    {
+      "epoch": 0.6602564102564102,
+      "grad_norm": 4.354835545935755,
+      "learning_rate": 2.8442518621591084e-07,
+      "loss": 0.5905250310897827,
+      "step": 103,
+      "token_acc": 0.829204261302948
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 4.373092445004486,
+      "learning_rate": 2.748981275911633e-07,
+      "loss": 0.5381240248680115,
+      "step": 104,
+      "token_acc": 0.83758544921875
+    },
+    {
+      "epoch": 0.6730769230769231,
+      "grad_norm": 4.158187745451947,
+      "learning_rate": 2.6547249258818163e-07,
+      "loss": 0.5173760056495667,
+      "step": 105,
+      "token_acc": 0.8419111371040344
+    },
+    {
+      "epoch": 0.6794871794871795,
+      "grad_norm": 4.360270954918175,
+      "learning_rate": 2.561525280931828e-07,
+      "loss": 0.5291812419891357,
+      "step": 106,
+      "token_acc": 0.8481848239898682
+    },
+    {
+      "epoch": 0.6858974358974359,
+      "grad_norm": 4.394654818317897,
+      "learning_rate": 2.469424333806718e-07,
+      "loss": 0.5495140552520752,
+      "step": 107,
+      "token_acc": 0.8393661379814148
+    },
+    {
+      "epoch": 0.6923076923076923,
+      "grad_norm": 4.4823775203281,
+      "learning_rate": 2.378463582213842e-07,
+      "loss": 0.528519868850708,
+      "step": 108,
+      "token_acc": 0.8453878164291382
+    },
+    {
+      "epoch": 0.6987179487179487,
+      "grad_norm": 4.219004456899406,
+      "learning_rate": 2.2886840101253247e-07,
+      "loss": 0.4981050491333008,
+      "step": 109,
+      "token_acc": 0.8480120897293091
+    },
+    {
+      "epoch": 0.7051282051282052,
+      "grad_norm": 4.529812908764033,
+      "learning_rate": 2.2001260693120232e-07,
+      "loss": 0.5449967980384827,
+      "step": 110,
+      "token_acc": 0.838426411151886
+    },
+    {
+      "epoch": 0.7115384615384616,
+      "grad_norm": 4.264681112622175,
+      "learning_rate": 2.112829661117259e-07,
+      "loss": 0.5300467610359192,
+      "step": 111,
+      "token_acc": 0.8408368229866028
+    },
+    {
+      "epoch": 0.717948717948718,
+      "grad_norm": 4.224432096547358,
+      "learning_rate": 2.026834118478567e-07,
+      "loss": 0.5546152591705322,
+      "step": 112,
+      "token_acc": 0.8335220813751221
+    },
+    {
+      "epoch": 0.7243589743589743,
+      "grad_norm": 4.40275179389628,
+      "learning_rate": 1.9421781882055443e-07,
+      "loss": 0.5911360383033752,
+      "step": 113,
+      "token_acc": 0.8269292712211609
+    },
+    {
+      "epoch": 0.7307692307692307,
+      "grad_norm": 4.438604539947117,
+      "learning_rate": 1.858900013521788e-07,
+      "loss": 0.5440055131912231,
+      "step": 114,
+      "token_acc": 0.8439837098121643
+    },
+    {
+      "epoch": 0.7371794871794872,
+      "grad_norm": 4.3373601365266605,
+      "learning_rate": 1.777037116878804e-07,
+      "loss": 0.5248892307281494,
+      "step": 115,
+      "token_acc": 0.8428595662117004
+    },
+    {
+      "epoch": 0.7435897435897436,
+      "grad_norm": 3.9787786664284495,
+      "learning_rate": 1.6966263830495935e-07,
+      "loss": 0.5044262409210205,
+      "step": 116,
+      "token_acc": 0.8520953059196472
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 4.206466011537137,
+      "learning_rate": 1.6177040425095663e-07,
+      "loss": 0.5238938331604004,
+      "step": 117,
+      "token_acc": 0.8503661751747131
+    },
+    {
+      "epoch": 0.7564102564102564,
+      "grad_norm": 4.11543543662316,
+      "learning_rate": 1.5403056551122694e-07,
+      "loss": 0.4803212583065033,
+      "step": 118,
+      "token_acc": 0.861027181148529
+    },
+    {
+      "epoch": 0.7628205128205128,
+      "grad_norm": 4.307365738144286,
+      "learning_rate": 1.4644660940672627e-07,
+      "loss": 0.5092470645904541,
+      "step": 119,
+      "token_acc": 0.847130537033081
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 4.228969596612662,
+      "learning_rate": 1.3902195302273778e-07,
+      "loss": 0.5285454988479614,
+      "step": 120,
+      "token_acc": 0.8402243852615356
+    },
+    {
+      "epoch": 0.7756410256410257,
+      "grad_norm": 4.317437082837098,
+      "learning_rate": 1.3175994166924392e-07,
+      "loss": 0.5160696506500244,
+      "step": 121,
+      "token_acc": 0.843143880367279
+    },
+    {
+      "epoch": 0.782051282051282,
+      "grad_norm": 4.206230433373526,
+      "learning_rate": 1.2466384737363779e-07,
+      "loss": 0.5897269248962402,
+      "step": 122,
+      "token_acc": 0.8222737908363342
+    },
+    {
+      "epoch": 0.7884615384615384,
+      "grad_norm": 4.537270673759693,
+      "learning_rate": 1.1773686740645383e-07,
+      "loss": 0.5243645906448364,
+      "step": 123,
+      "token_acc": 0.8434827327728271
+    },
+    {
+      "epoch": 0.7948717948717948,
+      "grad_norm": 4.338972097867842,
+      "learning_rate": 1.1098212284078035e-07,
+      "loss": 0.5309593677520752,
+      "step": 124,
+      "token_acc": 0.8410027623176575
+    },
+    {
+      "epoch": 0.8012820512820513,
+      "grad_norm": 4.172268942442711,
+      "learning_rate": 1.0440265714600571e-07,
+      "loss": 0.5152863264083862,
+      "step": 125,
+      "token_acc": 0.8451645970344543
+    },
+    {
+      "epoch": 0.8076923076923077,
+      "grad_norm": 4.250149736675509,
+      "learning_rate": 9.800143481652979e-08,
+      "loss": 0.5396726131439209,
+      "step": 126,
+      "token_acc": 0.8436694741249084
+    },
+    {
+      "epoch": 0.8141025641025641,
+      "grad_norm": 4.405607345694319,
+      "learning_rate": 9.17813400360572e-08,
+      "loss": 0.5297789573669434,
+      "step": 127,
+      "token_acc": 0.8374651074409485
+    },
+    {
+      "epoch": 0.8205128205128205,
+      "grad_norm": 4.256107645791666,
+      "learning_rate": 8.574517537807896e-08,
+      "loss": 0.4773644208908081,
+      "step": 128,
+      "token_acc": 0.8574733138084412
+    },
+    {
+      "epoch": 0.8269230769230769,
+      "grad_norm": 4.402345485095196,
+      "learning_rate": 7.989566054312286e-08,
+      "loss": 0.5019025802612305,
+      "step": 129,
+      "token_acc": 0.8487906455993652
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 4.222991640512232,
+      "learning_rate": 7.423543113334435e-08,
+      "loss": 0.509482741355896,
+      "step": 130,
+      "token_acc": 0.8477755784988403
+    },
+    {
+      "epoch": 0.8397435897435898,
+      "grad_norm": 4.331972240710486,
+      "learning_rate": 6.876703746500984e-08,
+      "loss": 0.5533698201179504,
+      "step": 131,
+      "token_acc": 0.8373857736587524
+    },
+    {
+      "epoch": 0.8461538461538461,
+      "grad_norm": 4.085679298862501,
+      "learning_rate": 6.349294341940592e-08,
+      "loss": 0.5067212581634521,
+      "step": 132,
+      "token_acc": 0.8518636226654053
+    },
+    {
+      "epoch": 0.8525641025641025,
+      "grad_norm": 4.273453737642771,
+      "learning_rate": 5.8415525332695334e-08,
+      "loss": 0.49411821365356445,
+      "step": 133,
+      "token_acc": 0.8478958606719971
+    },
+    {
+      "epoch": 0.8589743589743589,
+      "grad_norm": 4.262340473425649,
+      "learning_rate": 5.353707092521581e-08,
+      "loss": 0.5529794692993164,
+      "step": 134,
+      "token_acc": 0.8371466994285583
+    },
+    {
+      "epoch": 0.8653846153846154,
+      "grad_norm": 4.382139460926817,
+      "learning_rate": 4.885977827070747e-08,
+      "loss": 0.509425699710846,
+      "step": 135,
+      "token_acc": 0.8512613773345947
+    },
+    {
+      "epoch": 0.8717948717948718,
+      "grad_norm": 4.62171216340832,
+      "learning_rate": 4.4385754805932095e-08,
+      "loss": 0.5211268067359924,
+      "step": 136,
+      "token_acc": 0.84375
+    },
+    {
+      "epoch": 0.8782051282051282,
+      "grad_norm": 4.204088741197678,
+      "learning_rate": 4.0117016381130634e-08,
+      "loss": 0.49088388681411743,
+      "step": 137,
+      "token_acc": 0.8564977645874023
+    },
+    {
+      "epoch": 0.8846153846153846,
+      "grad_norm": 4.377681174151091,
+      "learning_rate": 3.6055486351745324e-08,
+      "loss": 0.5041382312774658,
+      "step": 138,
+      "token_acc": 0.8541952967643738
+    },
+    {
+      "epoch": 0.8910256410256411,
+      "grad_norm": 4.32917767077044,
+      "learning_rate": 3.220299471181898e-08,
+      "loss": 0.5480432510375977,
+      "step": 139,
+      "token_acc": 0.8370358347892761
+    },
+    {
+      "epoch": 0.8974358974358975,
+      "grad_norm": 4.190278678307778,
+      "learning_rate": 2.8561277269457895e-08,
+      "loss": 0.5408469438552856,
+      "step": 140,
+      "token_acc": 0.8408304452896118
+    },
+    {
+      "epoch": 0.9038461538461539,
+      "grad_norm": 4.429799923194152,
+      "learning_rate": 2.513197486473406e-08,
+      "loss": 0.5250701904296875,
+      "step": 141,
+      "token_acc": 0.8404897451400757
+    },
+    {
+      "epoch": 0.9102564102564102,
+      "grad_norm": 4.240556447333352,
+      "learning_rate": 2.1916632630374577e-08,
+      "loss": 0.49129876494407654,
+      "step": 142,
+      "token_acc": 0.8528439402580261
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 4.094432099971454,
+      "learning_rate": 1.8916699295575323e-08,
+      "loss": 0.5107077360153198,
+      "step": 143,
+      "token_acc": 0.8480400443077087
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 4.096105481468722,
+      "learning_rate": 1.6133526533250563e-08,
+      "loss": 0.48216086626052856,
+      "step": 144,
+      "token_acc": 0.8517335057258606
+    },
+    {
+      "epoch": 0.9294871794871795,
+      "grad_norm": 3.999889432088483,
+      "learning_rate": 1.3568368351012716e-08,
+      "loss": 0.4809270203113556,
+      "step": 145,
+      "token_acc": 0.8589807748794556
+    },
+    {
+      "epoch": 0.9358974358974359,
+      "grad_norm": 4.312742288765351,
+      "learning_rate": 1.1222380526156927e-08,
+      "loss": 0.4755428731441498,
+      "step": 146,
+      "token_acc": 0.8620191216468811
+    },
+    {
+      "epoch": 0.9423076923076923,
+      "grad_norm": 4.535728213741053,
+      "learning_rate": 9.096620084905471e-09,
+      "loss": 0.5387917757034302,
+      "step": 147,
+      "token_acc": 0.8353335857391357
+    },
+    {
+      "epoch": 0.9487179487179487,
+      "grad_norm": 4.1679468407507905,
+      "learning_rate": 7.1920448261457715e-09,
+      "loss": 0.4899570345878601,
+      "step": 148,
+      "token_acc": 0.8593058586120605
+    },
+    {
+      "epoch": 0.9551282051282052,
+      "grad_norm": 4.2351778138083604,
+      "learning_rate": 5.509512889877332e-09,
+      "loss": 0.5454012155532837,
+      "step": 149,
+      "token_acc": 0.8417900800704956
+    },
+    {
+      "epoch": 0.9615384615384616,
+      "grad_norm": 4.296677102692333,
+      "learning_rate": 4.049782370561583e-09,
+      "loss": 0.523182213306427,
+      "step": 150,
+      "token_acc": 0.8438036441802979
+    },
+    {
+      "epoch": 0.967948717948718,
+      "grad_norm": 4.323468547344599,
+      "learning_rate": 2.813510975548772e-09,
+      "loss": 0.5141420364379883,
+      "step": 151,
+      "token_acc": 0.8459829092025757
+    },
+    {
+      "epoch": 0.9743589743589743,
+      "grad_norm": 4.286188181441807,
+      "learning_rate": 1.8012557287367391e-09,
+      "loss": 0.5381837487220764,
+      "step": 152,
+      "token_acc": 0.8385339379310608
+    },
+    {
+      "epoch": 0.9807692307692307,
+      "grad_norm": 4.395230614166292,
+      "learning_rate": 1.013472719593733e-09,
+      "loss": 0.5164992809295654,
+      "step": 153,
+      "token_acc": 0.848739504814148
+    },
+    {
+      "epoch": 0.9871794871794872,
+      "grad_norm": 4.382167968792246,
+      "learning_rate": 4.5051689765929213e-10,
+      "loss": 0.5406173467636108,
+      "step": 154,
+      "token_acc": 0.8395528793334961
+    },
+    {
+      "epoch": 0.9935897435897436,
+      "grad_norm": 4.218184305013247,
+      "learning_rate": 1.1264191261528555e-10,
+      "loss": 0.5075889229774475,
+      "step": 155,
+      "token_acc": 0.8444151282310486
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.958298958687034,
+      "learning_rate": 0.0,
+      "loss": 0.5462541580200195,
+      "step": 156,
+      "token_acc": 0.8386925458908081
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5514746904373169,
+      "eval_runtime": 10.0275,
+      "eval_samples_per_second": 0.898,
+      "eval_steps_per_second": 0.199,
+      "eval_token_acc": 0.8481510877609253,
+      "step": 156
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5514746904373169,
+      "eval_runtime": 12.1526,
+      "eval_samples_per_second": 0.741,
+      "eval_steps_per_second": 0.165,
+      "eval_token_acc": 0.8481510877609253,
+      "step": 156
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 156,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 69981770547200.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72d9d5a75a35915924189be0b5520b4e1ac2518219811a411df1d2f6a24029b5
+size 9041

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/checkpoint-156/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to make the helpers below print verbose diagnostics
+debug = 0
+# load to cpu: this device is passed as ``map_location`` to every ``torch.load`` call in this script
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    """
+    Load every model states file and keep only what is needed to rebuild the fp32 state dict.
+    Args:
+        - ``files``: paths of the model states files to read
+    Returns:
+        - list with one ``zero_model_state`` per entry of ``files``, in the same order
+    Raises:
+        - ``ValueError``: a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): weights_only=False unpickles arbitrary objects - only load checkpoints you trust
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        # NOTE(review): param_names is filled here but never read again in this function
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params: stored as [key, value] pairs of the checkpoint's "shared_params" mapping
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        # both entries are optional and default to None when the checkpoint lacks them
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and extract the flat fp32 weight groups of every rank.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+    Returns:
+        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files does not match
+          the recorded partition count, or the zero stage is greater than 3
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # NOTE(review): weights_only=False unpickles arbitrary objects - only load checkpoints you trust
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    # stage and partition count are read from the first file only
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    # one entry per loaded file, in the order of ``files``
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in the first model state into ``state_dict`` (ZeRO stage <= 2).
+    Args:
+        - ``state_dict``: dict updated in place, keyed by parameter name
+        - ``zero_model_states``: parsed model states; only element 0 is read
+    Does nothing when the first model state has no frozen parameter shapes.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # the fragment stored under this name is used as-is, without any reshaping or merging
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+    Args:
+        - ``state_dict``: dict updated in place, keyed by parameter name
+        - ``world_size``: number of ranks; used for the debug output and the alignment check
+        - ``fp32_flat_groups``: per rank, the sequence of flat fp32 partitions (one per param group)
+        - ``zero_model_states``: parsed model states; only element 0's ``param_shapes`` is read
+    Raises:
+        - ``ValueError``: after alignment, the consumed elements of a group differ from the available ones
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # concatenate group i of every rank into one flat vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through the merged vector of this group, one parameter at a time
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may lack a numel() method, in which case the element count is the product of its entries
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # narrow() + view() slice the parameter out of the merged vector and give it its full shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO-3 checkpoint and store them in ``state_dict``.
+    Args:
+        - ``state_dict``: dict updated in place, keyed by parameter name
+        - ``world_size``: number of ranks; used for the debug output and the partition info
+        - ``zero_model_states``: parsed model states; shapes come from element 0, fragments from every element
+    Does nothing when the first model state has no frozen parameter shapes.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # concatenate the fragments of all model states, drop the trailing extra elements, restore the shape
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # only needed for the debug message below
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights are written to ``output_dir`` shard by shard; when the result is sharded an index
+    JSON file is written next to the shards.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: ``safetensors`` or ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True: the state_dict is returned as-is and only materialized per shard further down
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: the split is planned on empty tensors that only
+        # carry shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a minimal stand-in exposing the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # materialize only the tensors that belong to the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_loss.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_runtime.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_samples_per_second.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_steps_per_second.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/eval_token_acc.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_epoch.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_grad_norm.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_learning_rate.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_loss.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_token_acc.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_total_flos.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_loss.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_runtime.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_samples_per_second.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/images/train_train_steps_per_second.png ADDED Viewed

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/logging.jsonl ADDED Viewed

	@@ -0,0 +1,160 @@

+{"loss": 1.33325613, "grad_norm": 204.55272732, "learning_rate": 1.2e-07, "token_acc": 0.70947713, "epoch": 0.00641026, "global_step/max_steps": "1/156", "percentage": "0.64%", "elapsed_time": "1m 33s", "remaining_time": "4h 0m 47s", "memory(GiB)": 24.96, "train_speed(iter/s)": 0.010728}
+{"loss": 1.29191422, "grad_norm": 472.01805133, "learning_rate": 2.5e-07, "token_acc": 0.71543646, "epoch": 0.01282051, "global_step/max_steps": "2/156", "percentage": "1.28%", "elapsed_time": "3m 5s", "remaining_time": "3h 58m 12s", "memory(GiB)": 37.77, "train_speed(iter/s)": 0.010775}
+{"loss": 1.3103106, "grad_norm": 295.90115389, "learning_rate": 3.8e-07, "token_acc": 0.71264368, "epoch": 0.01923077, "global_step/max_steps": "3/156", "percentage": "1.92%", "elapsed_time": "4m 30s", "remaining_time": "3h 50m 14s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011076}
+{"loss": 1.35691023, "grad_norm": 420.50233874, "learning_rate": 5e-07, "token_acc": 0.70171893, "epoch": 0.02564103, "global_step/max_steps": "4/156", "percentage": "2.56%", "elapsed_time": "5m 55s", "remaining_time": "3h 45m 0s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011258}
+{"loss": 1.34290111, "grad_norm": 532.48835195, "learning_rate": 6.2e-07, "token_acc": 0.70352352, "epoch": 0.03205128, "global_step/max_steps": "5/156", "percentage": "3.21%", "elapsed_time": "7m 13s", "remaining_time": "3h 38m 20s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011526}
+{"loss": 1.34328222, "grad_norm": 745.15335693, "learning_rate": 7.5e-07, "token_acc": 0.70404929, "epoch": 0.03846154, "global_step/max_steps": "6/156", "percentage": "3.85%", "elapsed_time": "8m 24s", "remaining_time": "3h 30m 9s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011896}
+{"loss": 1.23723578, "grad_norm": 316.95973789, "learning_rate": 8.7e-07, "token_acc": 0.71942562, "epoch": 0.04487179, "global_step/max_steps": "7/156", "percentage": "4.49%", "elapsed_time": "9m 51s", "remaining_time": "3h 29m 49s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011835}
+{"loss": 1.1821543, "grad_norm": 80.89470278, "learning_rate": 1e-06, "token_acc": 0.70987546, "epoch": 0.05128205, "global_step/max_steps": "8/156", "percentage": "5.13%", "elapsed_time": "11m 15s", "remaining_time": "3h 28m 25s", "memory(GiB)": 58.64, "train_speed(iter/s)": 0.011835}
+{"loss": 1.19064462, "grad_norm": 194.43529651, "learning_rate": 1e-06, "token_acc": 0.70667911, "epoch": 0.05769231, "global_step/max_steps": "9/156", "percentage": "5.77%", "elapsed_time": "12m 47s", "remaining_time": "3h 28m 51s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.01173}
+{"loss": 1.14480925, "grad_norm": 99.39471689, "learning_rate": 1e-06, "token_acc": 0.71327931, "epoch": 0.06410256, "global_step/max_steps": "10/156", "percentage": "6.41%", "elapsed_time": "14m 24s", "remaining_time": "3h 30m 27s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011562}
+{"loss": 1.08528066, "grad_norm": 84.23475207, "learning_rate": 1e-06, "token_acc": 0.7265625, "epoch": 0.07051282, "global_step/max_steps": "11/156", "percentage": "7.05%", "elapsed_time": "15m 39s", "remaining_time": "3h 26m 27s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011705}
+{"loss": 1.03270876, "grad_norm": 85.3562937, "learning_rate": 1e-06, "token_acc": 0.73598248, "epoch": 0.07692308, "global_step/max_steps": "12/156", "percentage": "7.69%", "elapsed_time": "16m 58s", "remaining_time": "3h 23m 46s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011778}
+{"loss": 1.01876926, "grad_norm": 29.61142573, "learning_rate": 1e-06, "token_acc": 0.73305017, "epoch": 0.08333333, "global_step/max_steps": "13/156", "percentage": "8.33%", "elapsed_time": "18m 23s", "remaining_time": "3h 22m 15s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011784}
+{"loss": 1.03286433, "grad_norm": 53.80220393, "learning_rate": 1e-06, "token_acc": 0.73363143, "epoch": 0.08974359, "global_step/max_steps": "14/156", "percentage": "8.97%", "elapsed_time": "19m 52s", "remaining_time": "3h 21m 33s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011742}
+{"loss": 0.93531168, "grad_norm": 26.63791402, "learning_rate": 9.9e-07, "token_acc": 0.75495529, "epoch": 0.09615385, "global_step/max_steps": "15/156", "percentage": "9.62%", "elapsed_time": "21m 14s", "remaining_time": "3h 19m 44s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011765}
+{"loss": 0.89379692, "grad_norm": 241.87630365, "learning_rate": 9.9e-07, "token_acc": 0.76340747, "epoch": 0.1025641, "global_step/max_steps": "16/156", "percentage": "10.26%", "elapsed_time": "22m 17s", "remaining_time": "3h 15m 6s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011959}
+{"loss": 0.90603113, "grad_norm": 260.04513555, "learning_rate": 9.9e-07, "token_acc": 0.76787245, "epoch": 0.10897436, "global_step/max_steps": "17/156", "percentage": "10.90%", "elapsed_time": "23m 29s", "remaining_time": "3h 12m 8s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012057}
+{"loss": 0.88351417, "grad_norm": 13.61011308, "learning_rate": 9.9e-07, "token_acc": 0.76899034, "epoch": 0.11538462, "global_step/max_steps": "18/156", "percentage": "11.54%", "elapsed_time": "24m 54s", "remaining_time": "3h 10m 58s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012044}
+{"loss": 0.86171758, "grad_norm": 13.22296825, "learning_rate": 9.9e-07, "token_acc": 0.77243066, "epoch": 0.12179487, "global_step/max_steps": "19/156", "percentage": "12.18%", "elapsed_time": "26m 21s", "remaining_time": "3h 10m 4s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012012}
+{"loss": 0.83477306, "grad_norm": 10.53957112, "learning_rate": 9.8e-07, "token_acc": 0.7823472, "epoch": 0.12820513, "global_step/max_steps": "20/156", "percentage": "12.82%", "elapsed_time": "28m 13s", "remaining_time": "3h 11m 57s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011808}
+{"loss": 0.87326491, "grad_norm": 10.15724493, "learning_rate": 9.8e-07, "token_acc": 0.76877952, "epoch": 0.13461538, "global_step/max_steps": "21/156", "percentage": "13.46%", "elapsed_time": "29m 44s", "remaining_time": "3h 11m 10s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011769}
+{"loss": 0.853962, "grad_norm": 8.89566429, "learning_rate": 9.8e-07, "token_acc": 0.77166915, "epoch": 0.14102564, "global_step/max_steps": "22/156", "percentage": "14.10%", "elapsed_time": "30m 52s", "remaining_time": "3h 8m 2s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011877}
+{"loss": 0.81837565, "grad_norm": 8.04658391, "learning_rate": 9.7e-07, "token_acc": 0.77758682, "epoch": 0.1474359, "global_step/max_steps": "23/156", "percentage": "14.74%", "elapsed_time": "32m 8s", "remaining_time": "3h 5m 49s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011929}
+{"loss": 0.74004531, "grad_norm": 6.40986945, "learning_rate": 9.7e-07, "token_acc": 0.79958498, "epoch": 0.15384615, "global_step/max_steps": "24/156", "percentage": "15.38%", "elapsed_time": "33m 29s", "remaining_time": "3h 4m 10s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011945}
+{"loss": 0.75977117, "grad_norm": 6.14634333, "learning_rate": 9.7e-07, "token_acc": 0.78765434, "epoch": 0.16025641, "global_step/max_steps": "25/156", "percentage": "16.03%", "elapsed_time": "34m 45s", "remaining_time": "3h 2m 6s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.011989}
+{"loss": 0.76574373, "grad_norm": 6.46083555, "learning_rate": 9.6e-07, "token_acc": 0.78669, "epoch": 0.16666667, "global_step/max_steps": "26/156", "percentage": "16.67%", "elapsed_time": "35m 53s", "remaining_time": "2h 59m 29s", "memory(GiB)": 79.68, "train_speed(iter/s)": 0.012071}
+{"loss": 0.7088508, "grad_norm": 5.9896133, "learning_rate": 9.6e-07, "token_acc": 0.8035453, "epoch": 0.17307692, "global_step/max_steps": "27/156", "percentage": "17.31%", "elapsed_time": "37m 10s", "remaining_time": "2h 57m 38s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012103}
+{"loss": 0.69491911, "grad_norm": 6.86287787, "learning_rate": 9.6e-07, "token_acc": 0.80570823, "epoch": 0.17948718, "global_step/max_steps": "28/156", "percentage": "17.95%", "elapsed_time": "38m 33s", "remaining_time": "2h 56m 16s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012102}
+{"loss": 0.68353021, "grad_norm": 5.45499932, "learning_rate": 9.5e-07, "token_acc": 0.80842853, "epoch": 0.18589744, "global_step/max_steps": "29/156", "percentage": "18.59%", "elapsed_time": "40m 4s", "remaining_time": "2h 55m 29s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012062}
+{"loss": 0.65994573, "grad_norm": 5.46251925, "learning_rate": 9.5e-07, "token_acc": 0.81316453, "epoch": 0.19230769, "global_step/max_steps": "30/156", "percentage": "19.23%", "elapsed_time": "41m 18s", "remaining_time": "2h 53m 30s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012104}
+{"loss": 0.68330657, "grad_norm": 5.90894386, "learning_rate": 9.4e-07, "token_acc": 0.81380141, "epoch": 0.19871795, "global_step/max_steps": "31/156", "percentage": "19.87%", "elapsed_time": "42m 29s", "remaining_time": "2h 51m 19s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012161}
+{"loss": 0.68613803, "grad_norm": 5.28856744, "learning_rate": 9.4e-07, "token_acc": 0.80556458, "epoch": 0.20512821, "global_step/max_steps": "32/156", "percentage": "20.51%", "elapsed_time": "43m 36s", "remaining_time": "2h 48m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01223}
+{"loss": 0.63391989, "grad_norm": 5.35706529, "learning_rate": 9.3e-07, "token_acc": 0.81883174, "epoch": 0.21153846, "global_step/max_steps": "33/156", "percentage": "21.15%", "elapsed_time": "44m 55s", "remaining_time": "2h 47m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012241}
+{"loss": 0.6426698, "grad_norm": 5.16060255, "learning_rate": 9.3e-07, "token_acc": 0.81714875, "epoch": 0.21794872, "global_step/max_steps": "34/156", "percentage": "21.79%", "elapsed_time": "46m 13s", "remaining_time": "2h 45m 51s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01226}
+{"loss": 0.6970824, "grad_norm": 4.8647035, "learning_rate": 9.2e-07, "token_acc": 0.80546129, "epoch": 0.22435897, "global_step/max_steps": "35/156", "percentage": "22.44%", "elapsed_time": "47m 25s", "remaining_time": "2h 43m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012301}
+{"loss": 0.63011646, "grad_norm": 5.20984227, "learning_rate": 9.1e-07, "token_acc": 0.81981981, "epoch": 0.23076923, "global_step/max_steps": "36/156", "percentage": "23.08%", "elapsed_time": "48m 54s", "remaining_time": "2h 43m 2s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
+{"loss": 0.64742309, "grad_norm": 5.06085511, "learning_rate": 9.1e-07, "token_acc": 0.8159399, "epoch": 0.23717949, "global_step/max_steps": "37/156", "percentage": "23.72%", "elapsed_time": "50m 11s", "remaining_time": "2h 41m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012287}
+{"loss": 0.69717383, "grad_norm": 4.61811775, "learning_rate": 9e-07, "token_acc": 0.80520284, "epoch": 0.24358974, "global_step/max_steps": "38/156", "percentage": "24.36%", "elapsed_time": "51m 43s", "remaining_time": "2h 40m 37s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012243}
+{"loss": 0.62579024, "grad_norm": 4.66712972, "learning_rate": 9e-07, "token_acc": 0.81843996, "epoch": 0.25, "global_step/max_steps": "39/156", "percentage": "25.00%", "elapsed_time": "53m 3s", "remaining_time": "2h 39m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012251}
+{"loss": 0.65105742, "grad_norm": 4.75867456, "learning_rate": 8.9e-07, "token_acc": 0.81644934, "epoch": 0.25641026, "global_step/max_steps": "40/156", "percentage": "25.64%", "elapsed_time": "54m 22s", "remaining_time": "2h 37m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012261}
+{"loss": 0.63637537, "grad_norm": 4.88613943, "learning_rate": 8.8e-07, "token_acc": 0.81838524, "epoch": 0.26282051, "global_step/max_steps": "41/156", "percentage": "26.28%", "elapsed_time": "55m 40s", "remaining_time": "2h 36m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
+{"loss": 0.65643597, "grad_norm": 4.73657968, "learning_rate": 8.8e-07, "token_acc": 0.81969416, "epoch": 0.26923077, "global_step/max_steps": "42/156", "percentage": "26.92%", "elapsed_time": "56m 50s", "remaining_time": "2h 34m 18s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012313}
+{"loss": 0.63899696, "grad_norm": 4.55758133, "learning_rate": 8.7e-07, "token_acc": 0.81999677, "epoch": 0.27564103, "global_step/max_steps": "43/156", "percentage": "27.56%", "elapsed_time": "58m 12s", "remaining_time": "2h 32m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012314}
+{"loss": 0.63628721, "grad_norm": 4.99415384, "learning_rate": 8.6e-07, "token_acc": 0.8194927, "epoch": 0.28205128, "global_step/max_steps": "44/156", "percentage": "28.21%", "elapsed_time": "59m 35s", "remaining_time": "2h 31m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012306}
+{"loss": 0.63944864, "grad_norm": 4.54901661, "learning_rate": 8.5e-07, "token_acc": 0.81742072, "epoch": 0.28846154, "global_step/max_steps": "45/156", "percentage": "28.85%", "elapsed_time": "1h 1m 0s", "remaining_time": "2h 30m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012295}
+{"loss": 0.61701751, "grad_norm": 4.76973775, "learning_rate": 8.5e-07, "token_acc": 0.82378399, "epoch": 0.29487179, "global_step/max_steps": "46/156", "percentage": "29.49%", "elapsed_time": "1h 2m 17s", "remaining_time": "2h 28m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012309}
+{"loss": 0.59330505, "grad_norm": 4.68095775, "learning_rate": 8.4e-07, "token_acc": 0.82901824, "epoch": 0.30128205, "global_step/max_steps": "47/156", "percentage": "30.13%", "elapsed_time": "1h 3m 28s", "remaining_time": "2h 27m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012342}
+{"loss": 0.60955942, "grad_norm": 4.52158153, "learning_rate": 8.3e-07, "token_acc": 0.82506776, "epoch": 0.30769231, "global_step/max_steps": "48/156", "percentage": "30.77%", "elapsed_time": "1h 4m 39s", "remaining_time": "2h 25m 29s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012372}
+{"loss": 0.58634937, "grad_norm": 4.81313193, "learning_rate": 8.2e-07, "token_acc": 0.83429396, "epoch": 0.31410256, "global_step/max_steps": "49/156", "percentage": "31.41%", "elapsed_time": "1h 6m 2s", "remaining_time": "2h 24m 12s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012366}
+{"loss": 0.62977922, "grad_norm": 4.87404017, "learning_rate": 8.1e-07, "token_acc": 0.8187986, "epoch": 0.32051282, "global_step/max_steps": "50/156", "percentage": "32.05%", "elapsed_time": "1h 7m 14s", "remaining_time": "2h 22m 32s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
+{"loss": 0.59493053, "grad_norm": 4.94227702, "learning_rate": 8.1e-07, "token_acc": 0.83116883, "epoch": 0.32692308, "global_step/max_steps": "51/156", "percentage": "32.69%", "elapsed_time": "1h 8m 33s", "remaining_time": "2h 21m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012399}
+{"loss": 0.57395041, "grad_norm": 4.39539597, "learning_rate": 8e-07, "token_acc": 0.83097446, "epoch": 0.33333333, "global_step/max_steps": "52/156", "percentage": "33.33%", "elapsed_time": "1h 10m 0s", "remaining_time": "2h 20m 0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01238}
+{"loss": 0.61727303, "grad_norm": 4.73591311, "learning_rate": 7.9e-07, "token_acc": 0.82129598, "epoch": 0.33974359, "global_step/max_steps": "53/156", "percentage": "33.97%", "elapsed_time": "1h 11m 36s", "remaining_time": "2h 19m 9s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012335}
+{"loss": 0.55737722, "grad_norm": 4.37825661, "learning_rate": 7.8e-07, "token_acc": 0.84513646, "epoch": 0.34615385, "global_step/max_steps": "54/156", "percentage": "34.62%", "elapsed_time": "1h 12m 46s", "remaining_time": "2h 17m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012366}
+{"loss": 0.57170796, "grad_norm": 4.68421898, "learning_rate": 7.7e-07, "token_acc": 0.83828151, "epoch": 0.3525641, "global_step/max_steps": "55/156", "percentage": "35.26%", "elapsed_time": "1h 14m 1s", "remaining_time": "2h 15m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012383}
+{"loss": 0.59979248, "grad_norm": 4.42801189, "learning_rate": 7.6e-07, "token_acc": 0.8215729, "epoch": 0.35897436, "global_step/max_steps": "56/156", "percentage": "35.90%", "elapsed_time": "1h 15m 27s", "remaining_time": "2h 14m 45s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012369}
+{"loss": 0.55416632, "grad_norm": 4.82672986, "learning_rate": 7.5e-07, "token_acc": 0.8405695, "epoch": 0.36538462, "global_step/max_steps": "57/156", "percentage": "36.54%", "elapsed_time": "1h 16m 41s", "remaining_time": "2h 13m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
+{"loss": 0.57087392, "grad_norm": 4.35926386, "learning_rate": 7.4e-07, "token_acc": 0.83499038, "epoch": 0.37179487, "global_step/max_steps": "58/156", "percentage": "37.18%", "elapsed_time": "1h 18m 10s", "remaining_time": "2h 12m 5s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012365}
+{"loss": 0.54257536, "grad_norm": 4.82886644, "learning_rate": 7.3e-07, "token_acc": 0.83735251, "epoch": 0.37820513, "global_step/max_steps": "59/156", "percentage": "37.82%", "elapsed_time": "1h 19m 24s", "remaining_time": "2h 10m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012382}
+{"loss": 0.56238914, "grad_norm": 4.59861324, "learning_rate": 7.3e-07, "token_acc": 0.83281517, "epoch": 0.38461538, "global_step/max_steps": "60/156", "percentage": "38.46%", "elapsed_time": "1h 20m 50s", "remaining_time": "2h 9m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01237}
+{"loss": 0.57429522, "grad_norm": 4.84046203, "learning_rate": 7.2e-07, "token_acc": 0.82906735, "epoch": 0.39102564, "global_step/max_steps": "61/156", "percentage": "39.10%", "elapsed_time": "1h 22m 14s", "remaining_time": "2h 8m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012362}
+{"loss": 0.58638889, "grad_norm": 4.23328234, "learning_rate": 7.1e-07, "token_acc": 0.82650191, "epoch": 0.3974359, "global_step/max_steps": "62/156", "percentage": "39.74%", "elapsed_time": "1h 23m 45s", "remaining_time": "2h 6m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012336}
+{"loss": 0.53687143, "grad_norm": 4.39388537, "learning_rate": 7e-07, "token_acc": 0.84317684, "epoch": 0.40384615, "global_step/max_steps": "63/156", "percentage": "40.38%", "elapsed_time": "1h 25m 19s", "remaining_time": "2h 5m 56s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012307}
+{"loss": 0.60582626, "grad_norm": 4.66253018, "learning_rate": 6.9e-07, "token_acc": 0.82148314, "epoch": 0.41025641, "global_step/max_steps": "64/156", "percentage": "41.03%", "elapsed_time": "1h 26m 44s", "remaining_time": "2h 4m 41s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012297}
+{"loss": 0.55621356, "grad_norm": 4.558812, "learning_rate": 6.8e-07, "token_acc": 0.83885592, "epoch": 0.41666667, "global_step/max_steps": "65/156", "percentage": "41.67%", "elapsed_time": "1h 28m 15s", "remaining_time": "2h 3m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
+{"loss": 0.56102502, "grad_norm": 4.76539571, "learning_rate": 6.7e-07, "token_acc": 0.83055019, "epoch": 0.42307692, "global_step/max_steps": "66/156", "percentage": "42.31%", "elapsed_time": "1h 29m 28s", "remaining_time": "2h 2m 1s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012293}
+{"loss": 0.54189265, "grad_norm": 4.56774461, "learning_rate": 6.6e-07, "token_acc": 0.84312022, "epoch": 0.42948718, "global_step/max_steps": "67/156", "percentage": "42.95%", "elapsed_time": "1h 30m 56s", "remaining_time": "2h 0m 48s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012278}
+{"loss": 0.55337489, "grad_norm": 4.52994682, "learning_rate": 6.5e-07, "token_acc": 0.83290797, "epoch": 0.43589744, "global_step/max_steps": "68/156", "percentage": "43.59%", "elapsed_time": "1h 32m 17s", "remaining_time": "1h 59m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01228}
+{"loss": 0.5477522, "grad_norm": 4.45722127, "learning_rate": 6.4e-07, "token_acc": 0.8378194, "epoch": 0.44230769, "global_step/max_steps": "69/156", "percentage": "44.23%", "elapsed_time": "1h 33m 57s", "remaining_time": "1h 58m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01224}
+{"loss": 0.55349118, "grad_norm": 4.61153178, "learning_rate": 6.3e-07, "token_acc": 0.8411966, "epoch": 0.44871795, "global_step/max_steps": "70/156", "percentage": "44.87%", "elapsed_time": "1h 35m 20s", "remaining_time": "1h 57m 7s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012237}
+{"loss": 0.53457195, "grad_norm": 4.41866199, "learning_rate": 6.2e-07, "token_acc": 0.83968687, "epoch": 0.45512821, "global_step/max_steps": "71/156", "percentage": "45.51%", "elapsed_time": "1h 36m 36s", "remaining_time": "1h 55m 39s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012248}
+{"loss": 0.57335734, "grad_norm": 4.43861453, "learning_rate": 6.1e-07, "token_acc": 0.83286119, "epoch": 0.46153846, "global_step/max_steps": "72/156", "percentage": "46.15%", "elapsed_time": "1h 37m 51s", "remaining_time": "1h 54m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012262}
+{"loss": 0.54812038, "grad_norm": 4.30486017, "learning_rate": 5.9e-07, "token_acc": 0.83778405, "epoch": 0.46794872, "global_step/max_steps": "73/156", "percentage": "46.79%", "elapsed_time": "1h 39m 10s", "remaining_time": "1h 52m 46s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
+{"loss": 0.55430961, "grad_norm": 4.1943194, "learning_rate": 5.8e-07, "token_acc": 0.83625638, "epoch": 0.47435897, "global_step/max_steps": "74/156", "percentage": "47.44%", "elapsed_time": "1h 40m 28s", "remaining_time": "1h 51m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012275}
+{"loss": 0.57966816, "grad_norm": 4.62980085, "learning_rate": 5.7e-07, "token_acc": 0.83285105, "epoch": 0.48076923, "global_step/max_steps": "75/156", "percentage": "48.08%", "elapsed_time": "1h 41m 40s", "remaining_time": "1h 49m 48s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012294}
+{"loss": 0.54380375, "grad_norm": 4.35547228, "learning_rate": 5.6e-07, "token_acc": 0.84344262, "epoch": 0.48717949, "global_step/max_steps": "76/156", "percentage": "48.72%", "elapsed_time": "1h 42m 50s", "remaining_time": "1h 48m 15s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012317}
+{"loss": 0.53343827, "grad_norm": 4.62774062, "learning_rate": 5.5e-07, "token_acc": 0.84348905, "epoch": 0.49358974, "global_step/max_steps": "77/156", "percentage": "49.36%", "elapsed_time": "1h 44m 42s", "remaining_time": "1h 47m 26s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012256}
+{"loss": 0.53137887, "grad_norm": 4.27804086, "learning_rate": 5.4e-07, "token_acc": 0.84490591, "epoch": 0.5, "global_step/max_steps": "78/156", "percentage": "50.00%", "elapsed_time": "1h 45m 58s", "remaining_time": "1h 45m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
+{"loss": 0.52323365, "grad_norm": 4.09285521, "learning_rate": 5.3e-07, "token_acc": 0.8468861, "epoch": 0.50641026, "global_step/max_steps": "79/156", "percentage": "50.64%", "elapsed_time": "1h 47m 18s", "remaining_time": "1h 44m 35s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012269}
+{"loss": 0.54979551, "grad_norm": 4.42899359, "learning_rate": 5.2e-07, "token_acc": 0.84460127, "epoch": 0.51282051, "global_step/max_steps": "80/156", "percentage": "51.28%", "elapsed_time": "1h 48m 49s", "remaining_time": "1h 43m 23s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012252}
+{"loss": 0.54692674, "grad_norm": 4.21577617, "learning_rate": 5.1e-07, "token_acc": 0.83669257, "epoch": 0.51923077, "global_step/max_steps": "81/156", "percentage": "51.92%", "elapsed_time": "1h 50m 3s", "remaining_time": "1h 41m 54s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012267}
+{"loss": 0.52708614, "grad_norm": 4.43886275, "learning_rate": 5e-07, "token_acc": 0.83714193, "epoch": 0.52564103, "global_step/max_steps": "82/156", "percentage": "52.56%", "elapsed_time": "1h 51m 11s", "remaining_time": "1h 40m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012292}
+{"loss": 0.53395653, "grad_norm": 4.17682636, "learning_rate": 4.9e-07, "token_acc": 0.84171021, "epoch": 0.53205128, "global_step/max_steps": "83/156", "percentage": "53.21%", "elapsed_time": "1h 52m 30s", "remaining_time": "1h 38m 57s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012295}
+{"loss": 0.54247582, "grad_norm": 4.53019853, "learning_rate": 4.8e-07, "token_acc": 0.83958578, "epoch": 0.53846154, "global_step/max_steps": "84/156", "percentage": "53.85%", "elapsed_time": "1h 53m 51s", "remaining_time": "1h 37m 35s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012296}
+{"loss": 0.52001339, "grad_norm": 4.37050988, "learning_rate": 4.7e-07, "token_acc": 0.84339178, "epoch": 0.54487179, "global_step/max_steps": "85/156", "percentage": "54.49%", "elapsed_time": "1h 55m 46s", "remaining_time": "1h 36m 42s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012236}
+{"loss": 0.54800576, "grad_norm": 4.38290539, "learning_rate": 4.6e-07, "token_acc": 0.83976507, "epoch": 0.55128205, "global_step/max_steps": "86/156", "percentage": "55.13%", "elapsed_time": "1h 57m 13s", "remaining_time": "1h 35m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012227}
+{"loss": 0.52284074, "grad_norm": 4.34244252, "learning_rate": 4.5e-07, "token_acc": 0.85395187, "epoch": 0.55769231, "global_step/max_steps": "87/156", "percentage": "55.77%", "elapsed_time": "1h 58m 37s", "remaining_time": "1h 34m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012223}
+{"loss": 0.54131281, "grad_norm": 4.13734102, "learning_rate": 4.4e-07, "token_acc": 0.83685899, "epoch": 0.56410256, "global_step/max_steps": "88/156", "percentage": "56.41%", "elapsed_time": "1h 59m 48s", "remaining_time": "1h 32m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012242}
+{"loss": 0.51603007, "grad_norm": 4.36974965, "learning_rate": 4.3e-07, "token_acc": 0.84785974, "epoch": 0.57051282, "global_step/max_steps": "89/156", "percentage": "57.05%", "elapsed_time": "2h 1m 17s", "remaining_time": "1h 31m 18s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01223}
+{"loss": 0.53859156, "grad_norm": 4.56841423, "learning_rate": 4.2e-07, "token_acc": 0.84574932, "epoch": 0.57692308, "global_step/max_steps": "90/156", "percentage": "57.69%", "elapsed_time": "2h 2m 41s", "remaining_time": "1h 29m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012225}
+{"loss": 0.52594697, "grad_norm": 4.26307779, "learning_rate": 4.1e-07, "token_acc": 0.8433637, "epoch": 0.58333333, "global_step/max_steps": "91/156", "percentage": "58.33%", "elapsed_time": "2h 3m 45s", "remaining_time": "1h 28m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012255}
+{"loss": 0.52416551, "grad_norm": 4.24665846, "learning_rate": 3.9e-07, "token_acc": 0.84782606, "epoch": 0.58974359, "global_step/max_steps": "92/156", "percentage": "58.97%", "elapsed_time": "2h 5m 11s", "remaining_time": "1h 27m 5s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012247}
+{"loss": 0.54643536, "grad_norm": 4.42645624, "learning_rate": 3.8e-07, "token_acc": 0.83482218, "epoch": 0.59615385, "global_step/max_steps": "93/156", "percentage": "59.62%", "elapsed_time": "2h 6m 45s", "remaining_time": "1h 25m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012228}
+{"loss": 0.53462034, "grad_norm": 4.57135257, "learning_rate": 3.7e-07, "token_acc": 0.84266156, "epoch": 0.6025641, "global_step/max_steps": "94/156", "percentage": "60.26%", "elapsed_time": "2h 7m 59s", "remaining_time": "1h 24m 25s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01224}
+{"loss": 0.4898929, "grad_norm": 4.30288369, "learning_rate": 3.6e-07, "token_acc": 0.85457271, "epoch": 0.60897436, "global_step/max_steps": "95/156", "percentage": "60.90%", "elapsed_time": "2h 9m 12s", "remaining_time": "1h 22m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012253}
+{"loss": 0.55974352, "grad_norm": 4.48851467, "learning_rate": 3.5e-07, "token_acc": 0.84236282, "epoch": 0.61538462, "global_step/max_steps": "96/156", "percentage": "61.54%", "elapsed_time": "2h 10m 30s", "remaining_time": "1h 21m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01226}
+{"loss": 0.59083301, "grad_norm": 4.50473731, "learning_rate": 3.4e-07, "token_acc": 0.83140576, "epoch": 0.62179487, "global_step/max_steps": "97/156", "percentage": "62.18%", "elapsed_time": "2h 11m 52s", "remaining_time": "1h 20m 12s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012259}
+{"loss": 0.54224521, "grad_norm": 4.32698999, "learning_rate": 3.3e-07, "token_acc": 0.84555858, "epoch": 0.62820513, "global_step/max_steps": "98/156", "percentage": "62.82%", "elapsed_time": "2h 13m 7s", "remaining_time": "1h 18m 47s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012269}
+{"loss": 0.4947038, "grad_norm": 4.0718176, "learning_rate": 3.2e-07, "token_acc": 0.85249788, "epoch": 0.63461538, "global_step/max_steps": "99/156", "percentage": "63.46%", "elapsed_time": "2h 14m 18s", "remaining_time": "1h 17m 19s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012285}
+{"loss": 0.48203486, "grad_norm": 4.36429258, "learning_rate": 3.1e-07, "token_acc": 0.85731459, "epoch": 0.64102564, "global_step/max_steps": "100/156", "percentage": "64.10%", "elapsed_time": "2h 15m 42s", "remaining_time": "1h 15m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012281}
+{"loss": 0.57775241, "grad_norm": 4.24510136, "learning_rate": 3e-07, "token_acc": 0.83192444, "epoch": 0.6474359, "global_step/max_steps": "101/156", "percentage": "64.74%", "elapsed_time": "2h 16m 51s", "remaining_time": "1h 14m 31s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012299}
+{"loss": 0.51172423, "grad_norm": 4.34557783, "learning_rate": 2.9e-07, "token_acc": 0.85045135, "epoch": 0.65384615, "global_step/max_steps": "102/156", "percentage": "65.38%", "elapsed_time": "2h 18m 14s", "remaining_time": "1h 13m 11s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012298}
+{"loss": 0.59052503, "grad_norm": 4.35483555, "learning_rate": 2.8e-07, "token_acc": 0.82920426, "epoch": 0.66025641, "global_step/max_steps": "103/156", "percentage": "66.03%", "elapsed_time": "2h 19m 34s", "remaining_time": "1h 11m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012299}
+{"loss": 0.53812402, "grad_norm": 4.37309245, "learning_rate": 2.7e-07, "token_acc": 0.83758545, "epoch": 0.66666667, "global_step/max_steps": "104/156", "percentage": "66.67%", "elapsed_time": "2h 20m 48s", "remaining_time": "1h 10m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01231}
+{"loss": 0.51737601, "grad_norm": 4.15818775, "learning_rate": 2.7e-07, "token_acc": 0.84191114, "epoch": 0.67307692, "global_step/max_steps": "105/156", "percentage": "67.31%", "elapsed_time": "2h 22m 1s", "remaining_time": "1h 8m 59s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012322}
+{"loss": 0.52918124, "grad_norm": 4.36027095, "learning_rate": 2.6e-07, "token_acc": 0.84818482, "epoch": 0.67948718, "global_step/max_steps": "106/156", "percentage": "67.95%", "elapsed_time": "2h 23m 15s", "remaining_time": "1h 7m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012332}
+{"loss": 0.54951406, "grad_norm": 4.39465482, "learning_rate": 2.5e-07, "token_acc": 0.83936614, "epoch": 0.68589744, "global_step/max_steps": "107/156", "percentage": "68.59%", "elapsed_time": "2h 24m 37s", "remaining_time": "1h 6m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01233}
+{"loss": 0.52851987, "grad_norm": 4.48237752, "learning_rate": 2.4e-07, "token_acc": 0.84538782, "epoch": 0.69230769, "global_step/max_steps": "108/156", "percentage": "69.23%", "elapsed_time": "2h 25m 51s", "remaining_time": "1h 4m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
+{"loss": 0.49810505, "grad_norm": 4.21900446, "learning_rate": 2.3e-07, "token_acc": 0.84801209, "epoch": 0.69871795, "global_step/max_steps": "109/156", "percentage": "69.87%", "elapsed_time": "2h 27m 11s", "remaining_time": "1h 3m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012343}
+{"loss": 0.5449968, "grad_norm": 4.52981291, "learning_rate": 2.2e-07, "token_acc": 0.83842641, "epoch": 0.70512821, "global_step/max_steps": "110/156", "percentage": "70.51%", "elapsed_time": "2h 28m 47s", "remaining_time": "1h 2m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012321}
+{"loss": 0.53004676, "grad_norm": 4.26468111, "learning_rate": 2.1e-07, "token_acc": 0.84083682, "epoch": 0.71153846, "global_step/max_steps": "111/156", "percentage": "71.15%", "elapsed_time": "2h 30m 3s", "remaining_time": "1h 0m 50s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012328}
+{"loss": 0.55461526, "grad_norm": 4.2244321, "learning_rate": 2e-07, "token_acc": 0.83352208, "epoch": 0.71794872, "global_step/max_steps": "112/156", "percentage": "71.79%", "elapsed_time": "2h 31m 22s", "remaining_time": "59m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012332}
+{"loss": 0.59113604, "grad_norm": 4.40275179, "learning_rate": 1.9e-07, "token_acc": 0.82692927, "epoch": 0.72435897, "global_step/max_steps": "113/156", "percentage": "72.44%", "elapsed_time": "2h 32m 49s", "remaining_time": "58m 9s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012323}
+{"loss": 0.54400551, "grad_norm": 4.43860454, "learning_rate": 1.9e-07, "token_acc": 0.84398371, "epoch": 0.73076923, "global_step/max_steps": "114/156", "percentage": "73.08%", "elapsed_time": "2h 33m 57s", "remaining_time": "56m 43s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
+{"loss": 0.52488923, "grad_norm": 4.33736014, "learning_rate": 1.8e-07, "token_acc": 0.84285957, "epoch": 0.73717949, "global_step/max_steps": "115/156", "percentage": "73.72%", "elapsed_time": "2h 35m 14s", "remaining_time": "55m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012346}
+{"loss": 0.50442624, "grad_norm": 3.97877867, "learning_rate": 1.7e-07, "token_acc": 0.85209531, "epoch": 0.74358974, "global_step/max_steps": "116/156", "percentage": "74.36%", "elapsed_time": "2h 36m 31s", "remaining_time": "53m 58s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012352}
+{"loss": 0.52389383, "grad_norm": 4.20646601, "learning_rate": 1.6e-07, "token_acc": 0.85036618, "epoch": 0.75, "global_step/max_steps": "117/156", "percentage": "75.00%", "elapsed_time": "2h 37m 50s", "remaining_time": "52m 36s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012354}
+{"loss": 0.48032126, "grad_norm": 4.11543544, "learning_rate": 1.5e-07, "token_acc": 0.86102718, "epoch": 0.75641026, "global_step/max_steps": "118/156", "percentage": "75.64%", "elapsed_time": "2h 39m 10s", "remaining_time": "51m 15s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012356}
+{"loss": 0.50924706, "grad_norm": 4.30736574, "learning_rate": 1.5e-07, "token_acc": 0.84713054, "epoch": 0.76282051, "global_step/max_steps": "119/156", "percentage": "76.28%", "elapsed_time": "2h 40m 30s", "remaining_time": "49m 54s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012356}
+{"loss": 0.5285455, "grad_norm": 4.2289696, "learning_rate": 1.4e-07, "token_acc": 0.84022439, "epoch": 0.76923077, "global_step/max_steps": "120/156", "percentage": "76.92%", "elapsed_time": "2h 42m 4s", "remaining_time": "48m 37s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01234}
+{"loss": 0.51606965, "grad_norm": 4.31743708, "learning_rate": 1.3e-07, "token_acc": 0.84314388, "epoch": 0.77564103, "global_step/max_steps": "121/156", "percentage": "77.56%", "elapsed_time": "2h 43m 19s", "remaining_time": "47m 14s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012348}
+{"loss": 0.58972692, "grad_norm": 4.20623043, "learning_rate": 1.2e-07, "token_acc": 0.82227379, "epoch": 0.78205128, "global_step/max_steps": "122/156", "percentage": "78.21%", "elapsed_time": "2h 44m 47s", "remaining_time": "45m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012339}
+{"loss": 0.52436459, "grad_norm": 4.53727067, "learning_rate": 1.2e-07, "token_acc": 0.84348273, "epoch": 0.78846154, "global_step/max_steps": "123/156", "percentage": "78.85%", "elapsed_time": "2h 46m 4s", "remaining_time": "44m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012344}
+{"loss": 0.53095937, "grad_norm": 4.3389721, "learning_rate": 1.1e-07, "token_acc": 0.84100276, "epoch": 0.79487179, "global_step/max_steps": "124/156", "percentage": "79.49%", "elapsed_time": "2h 47m 31s", "remaining_time": "43m 14s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012336}
+{"loss": 0.51528633, "grad_norm": 4.17226894, "learning_rate": 1e-07, "token_acc": 0.8451646, "epoch": 0.80128205, "global_step/max_steps": "125/156", "percentage": "80.13%", "elapsed_time": "2h 48m 49s", "remaining_time": "41m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012341}
+{"loss": 0.53967261, "grad_norm": 4.25014974, "learning_rate": 1e-07, "token_acc": 0.84366947, "epoch": 0.80769231, "global_step/max_steps": "126/156", "percentage": "80.77%", "elapsed_time": "2h 50m 7s", "remaining_time": "40m 30s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012344}
+{"loss": 0.52977896, "grad_norm": 4.40560735, "learning_rate": 9e-08, "token_acc": 0.83746511, "epoch": 0.81410256, "global_step/max_steps": "127/156", "percentage": "81.41%", "elapsed_time": "2h 51m 17s", "remaining_time": "39m 6s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012357}
+{"loss": 0.47736442, "grad_norm": 4.25610765, "learning_rate": 9e-08, "token_acc": 0.85747331, "epoch": 0.82051282, "global_step/max_steps": "128/156", "percentage": "82.05%", "elapsed_time": "2h 52m 29s", "remaining_time": "37m 44s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012367}
+{"loss": 0.50190258, "grad_norm": 4.40234549, "learning_rate": 8e-08, "token_acc": 0.84879065, "epoch": 0.82692308, "global_step/max_steps": "129/156", "percentage": "82.69%", "elapsed_time": "2h 53m 44s", "remaining_time": "36m 21s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012374}
+{"loss": 0.50948274, "grad_norm": 4.22299164, "learning_rate": 7e-08, "token_acc": 0.84777558, "epoch": 0.83333333, "global_step/max_steps": "130/156", "percentage": "83.33%", "elapsed_time": "2h 55m 10s", "remaining_time": "35m 2s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012369}
+{"loss": 0.55336982, "grad_norm": 4.33197224, "learning_rate": 7e-08, "token_acc": 0.83738577, "epoch": 0.83974359, "global_step/max_steps": "131/156", "percentage": "83.97%", "elapsed_time": "2h 56m 28s", "remaining_time": "33m 40s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012372}
+{"loss": 0.50672126, "grad_norm": 4.0856793, "learning_rate": 6e-08, "token_acc": 0.85186362, "epoch": 0.84615385, "global_step/max_steps": "132/156", "percentage": "84.62%", "elapsed_time": "2h 57m 38s", "remaining_time": "32m 17s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012385}
+{"loss": 0.49411821, "grad_norm": 4.27345374, "learning_rate": 6e-08, "token_acc": 0.84789586, "epoch": 0.8525641, "global_step/max_steps": "133/156", "percentage": "85.26%", "elapsed_time": "2h 58m 47s", "remaining_time": "30m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012398}
+{"loss": 0.55297947, "grad_norm": 4.26234047, "learning_rate": 5e-08, "token_acc": 0.8371467, "epoch": 0.85897436, "global_step/max_steps": "134/156", "percentage": "85.90%", "elapsed_time": "3h 0m 11s", "remaining_time": "29m 34s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
+{"loss": 0.5094257, "grad_norm": 4.38213946, "learning_rate": 5e-08, "token_acc": 0.85126138, "epoch": 0.86538462, "global_step/max_steps": "135/156", "percentage": "86.54%", "elapsed_time": "3h 1m 47s", "remaining_time": "28m 16s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012377}
+{"loss": 0.52112681, "grad_norm": 4.62171216, "learning_rate": 4e-08, "token_acc": 0.84375, "epoch": 0.87179487, "global_step/max_steps": "136/156", "percentage": "87.18%", "elapsed_time": "3h 3m 3s", "remaining_time": "26m 55s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012382}
+{"loss": 0.49088389, "grad_norm": 4.20408874, "learning_rate": 4e-08, "token_acc": 0.85649776, "epoch": 0.87820513, "global_step/max_steps": "137/156", "percentage": "87.82%", "elapsed_time": "3h 4m 18s", "remaining_time": "25m 33s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
+{"loss": 0.50413823, "grad_norm": 4.37768117, "learning_rate": 4e-08, "token_acc": 0.8541953, "epoch": 0.88461538, "global_step/max_steps": "138/156", "percentage": "88.46%", "elapsed_time": "3h 5m 40s", "remaining_time": "24m 13s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
+{"loss": 0.54804325, "grad_norm": 4.32917767, "learning_rate": 3e-08, "token_acc": 0.83703583, "epoch": 0.89102564, "global_step/max_steps": "139/156", "percentage": "89.10%", "elapsed_time": "3h 7m 0s", "remaining_time": "22m 52s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012388}
+{"loss": 0.54084694, "grad_norm": 4.19027868, "learning_rate": 3e-08, "token_acc": 0.84083045, "epoch": 0.8974359, "global_step/max_steps": "140/156", "percentage": "89.74%", "elapsed_time": "3h 8m 17s", "remaining_time": "21m 31s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012392}
+{"loss": 0.52507019, "grad_norm": 4.42979992, "learning_rate": 3e-08, "token_acc": 0.84048975, "epoch": 0.90384615, "global_step/max_steps": "141/156", "percentage": "90.38%", "elapsed_time": "3h 9m 34s", "remaining_time": "20m 10s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012396}
+{"loss": 0.49129876, "grad_norm": 4.24055645, "learning_rate": 2e-08, "token_acc": 0.85284394, "epoch": 0.91025641, "global_step/max_steps": "142/156", "percentage": "91.03%", "elapsed_time": "3h 10m 56s", "remaining_time": "18m 49s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
+{"loss": 0.51070774, "grad_norm": 4.0944321, "learning_rate": 2e-08, "token_acc": 0.84804004, "epoch": 0.91666667, "global_step/max_steps": "143/156", "percentage": "91.67%", "elapsed_time": "3h 12m 10s", "remaining_time": "17m 28s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012401}
+{"loss": 0.48216087, "grad_norm": 4.09610548, "learning_rate": 2e-08, "token_acc": 0.85173351, "epoch": 0.92307692, "global_step/max_steps": "144/156", "percentage": "92.31%", "elapsed_time": "3h 13m 37s", "remaining_time": "16m 8s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012395}
+{"loss": 0.48092702, "grad_norm": 3.99988943, "learning_rate": 1e-08, "token_acc": 0.85898077, "epoch": 0.92948718, "global_step/max_steps": "145/156", "percentage": "92.95%", "elapsed_time": "3h 15m 2s", "remaining_time": "14m 47s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012391}
+{"loss": 0.47554287, "grad_norm": 4.31274229, "learning_rate": 1e-08, "token_acc": 0.86201912, "epoch": 0.93589744, "global_step/max_steps": "146/156", "percentage": "93.59%", "elapsed_time": "3h 16m 24s", "remaining_time": "13m 27s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012389}
+{"loss": 0.53879178, "grad_norm": 4.53572821, "learning_rate": 1e-08, "token_acc": 0.83533359, "epoch": 0.94230769, "global_step/max_steps": "147/156", "percentage": "94.23%", "elapsed_time": "3h 17m 41s", "remaining_time": "12m 6s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012393}
+{"loss": 0.48995703, "grad_norm": 4.16794684, "learning_rate": 1e-08, "token_acc": 0.85930586, "epoch": 0.94871795, "global_step/max_steps": "148/156", "percentage": "94.87%", "elapsed_time": "3h 19m 1s", "remaining_time": "10m 45s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
+{"loss": 0.54540122, "grad_norm": 4.23517781, "learning_rate": 1e-08, "token_acc": 0.84179008, "epoch": 0.95512821, "global_step/max_steps": "149/156", "percentage": "95.51%", "elapsed_time": "3h 20m 17s", "remaining_time": "9m 24s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012399}
+{"loss": 0.52318221, "grad_norm": 4.2966771, "learning_rate": 0.0, "token_acc": 0.84380364, "epoch": 0.96153846, "global_step/max_steps": "150/156", "percentage": "96.15%", "elapsed_time": "3h 21m 43s", "remaining_time": "8m 4s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012394}
+{"loss": 0.51414204, "grad_norm": 4.32346855, "learning_rate": 0.0, "token_acc": 0.84598291, "epoch": 0.96794872, "global_step/max_steps": "151/156", "percentage": "96.79%", "elapsed_time": "3h 22m 52s", "remaining_time": "6m 43s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012405}
+{"loss": 0.53818375, "grad_norm": 4.28618818, "learning_rate": 0.0, "token_acc": 0.83853394, "epoch": 0.97435897, "global_step/max_steps": "152/156", "percentage": "97.44%", "elapsed_time": "3h 24m 8s", "remaining_time": "5m 22s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.01241}
+{"loss": 0.51649928, "grad_norm": 4.39523061, "learning_rate": 0.0, "token_acc": 0.8487395, "epoch": 0.98076923, "global_step/max_steps": "153/156", "percentage": "98.08%", "elapsed_time": "3h 25m 33s", "remaining_time": "4m 1s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012406}
+{"loss": 0.54061735, "grad_norm": 4.38216797, "learning_rate": 0.0, "token_acc": 0.83955288, "epoch": 0.98717949, "global_step/max_steps": "154/156", "percentage": "98.72%", "elapsed_time": "3h 26m 46s", "remaining_time": "2m 41s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012413}
+{"loss": 0.50758892, "grad_norm": 4.21818431, "learning_rate": 0.0, "token_acc": 0.84441513, "epoch": 0.99358974, "global_step/max_steps": "155/156", "percentage": "99.36%", "elapsed_time": "3h 28m 22s", "remaining_time": "1m 20s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012398}
+{"loss": 0.54625416, "grad_norm": 4.95829896, "learning_rate": 0.0, "token_acc": 0.83869255, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 29m 15s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012425}
+{"eval_loss": 0.55147469, "eval_runtime": 10.0275, "eval_samples_per_second": 0.898, "eval_steps_per_second": 0.199, "eval_token_acc": 0.84815109, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 29m 25s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012415}
+{"eval_loss": 0.55147469, "eval_runtime": 12.1526, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.165, "eval_token_acc": 0.84815109, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 33m 23s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.012184}
+{"train_runtime": 13045.8923, "train_samples_per_second": 0.763, "train_steps_per_second": 0.012, "total_flos": 69981770547200.0, "train_loss": 0.63741068, "epoch": 1.0, "global_step/max_steps": "156/156", "percentage": "100.00%", "elapsed_time": "3h 37m 24s", "remaining_time": "0s", "memory(GiB)": 92.6, "train_speed(iter/s)": 0.011959}
+{"train_dataset": "2651.186647±828.544792, min=980.000000, max=7751.000000, size=9960", "val_dataset": "2758.222222±600.102543, min=2032.000000, max=3684.000000, size=9", "model_parameter_info": "Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.", "last_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156", "best_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v1-20250820-211253/checkpoint-156", "best_metric": 0.55147469, "global_step": 156, "log_history": [{"loss": 1.3332561254501343, "grad_norm": 204.5527273204477, "learning_rate": 1.25e-07, "token_acc": 0.7094771265983582, "epoch": 0.00641025641025641, "step": 1}, {"loss": 1.2919142246246338, "grad_norm": 472.01805132730743, "learning_rate": 2.5e-07, "token_acc": 0.7154364585876465, "epoch": 0.01282051282051282, "step": 2}, {"loss": 1.3103106021881104, "grad_norm": 295.9011538877219, "learning_rate": 3.75e-07, "token_acc": 0.7126436829566956, "epoch": 0.019230769230769232, "step": 3}, {"loss": 1.356910228729248, "grad_norm": 420.5023387419371, "learning_rate": 5e-07, "token_acc": 0.7017189264297485, "epoch": 0.02564102564102564, "step": 4}, {"loss": 1.3429011106491089, "grad_norm": 532.4883519493468, "learning_rate": 6.249999999999999e-07, "token_acc": 0.7035235166549683, "epoch": 0.03205128205128205, "step": 5}, {"loss": 1.3432822227478027, "grad_norm": 745.1533569281419, "learning_rate": 7.5e-07, "token_acc": 0.704049289226532, "epoch": 0.038461538461538464, "step": 6}, {"loss": 1.2372357845306396, "grad_norm": 316.9597378880501, "learning_rate": 8.75e-07, "token_acc": 0.719425618648529, "epoch": 0.04487179487179487, "step": 7}, {"loss": 1.1821542978286743, "grad_norm": 80.89470277766097, "learning_rate": 1e-06, "token_acc": 0.7098754644393921, "epoch": 0.05128205128205128, "step": 8}, {"loss": 1.19064462184906, "grad_norm": 194.43529650602088, "learning_rate": 9.998873580873846e-07, "token_acc": 0.706679105758667, 
"epoch": 0.057692307692307696, "step": 9}, {"loss": 1.1448092460632324, "grad_norm": 99.3947168866689, "learning_rate": 9.995494831023408e-07, "token_acc": 0.7132793068885803, "epoch": 0.0641025641025641, "step": 10}, {"loss": 1.0852806568145752, "grad_norm": 84.23475207475708, "learning_rate": 9.989865272804063e-07, "token_acc": 0.7265625, "epoch": 0.07051282051282051, "step": 11}, {"loss": 1.032708764076233, "grad_norm": 85.35629370266734, "learning_rate": 9.98198744271263e-07, "token_acc": 0.7359824776649475, "epoch": 0.07692307692307693, "step": 12}, {"loss": 1.0187692642211914, "grad_norm": 29.61142573444334, "learning_rate": 9.971864890244513e-07, "token_acc": 0.7330501675605774, "epoch": 0.08333333333333333, "step": 13}, {"loss": 1.0328643321990967, "grad_norm": 53.80220392943845, "learning_rate": 9.959502176294382e-07, "token_acc": 0.733631432056427, "epoch": 0.08974358974358974, "step": 14}, {"loss": 0.9353116750717163, "grad_norm": 26.63791401902767, "learning_rate": 9.944904871101226e-07, "token_acc": 0.7549552917480469, "epoch": 0.09615384615384616, "step": 15}, {"loss": 0.8937969207763672, "grad_norm": 241.8763036480847, "learning_rate": 9.928079551738541e-07, "token_acc": 0.7634074687957764, "epoch": 0.10256410256410256, "step": 16}, {"loss": 0.9060311317443848, "grad_norm": 260.0451355494914, "learning_rate": 9.909033799150946e-07, "token_acc": 0.7678724527359009, "epoch": 0.10897435897435898, "step": 17}, {"loss": 0.8835141658782959, "grad_norm": 13.610113077700959, "learning_rate": 9.887776194738431e-07, "token_acc": 0.7689903378486633, "epoch": 0.11538461538461539, "step": 18}, {"loss": 0.8617175817489624, "grad_norm": 13.22296824616019, "learning_rate": 9.864316316489872e-07, "token_acc": 0.7724306583404541, "epoch": 0.12179487179487179, "step": 19}, {"loss": 0.834773063659668, "grad_norm": 10.539571124679378, "learning_rate": 9.838664734667495e-07, "token_acc": 0.7823472023010254, "epoch": 0.1282051282051282, "step": 20}, {"loss": 
0.8732649087905884, "grad_norm": 10.157244927886453, "learning_rate": 9.810833007044246e-07, "token_acc": 0.7687795162200928, "epoch": 0.1346153846153846, "step": 21}, {"loss": 0.8539620041847229, "grad_norm": 8.895664285984349, "learning_rate": 9.780833673696254e-07, "token_acc": 0.7716691493988037, "epoch": 0.14102564102564102, "step": 22}, {"loss": 0.8183756470680237, "grad_norm": 8.046583911365952, "learning_rate": 9.748680251352658e-07, "token_acc": 0.777586817741394, "epoch": 0.14743589743589744, "step": 23}, {"loss": 0.7400453090667725, "grad_norm": 6.409869446484142, "learning_rate": 9.71438722730542e-07, "token_acc": 0.7995849847793579, "epoch": 0.15384615384615385, "step": 24}, {"loss": 0.7597711682319641, "grad_norm": 6.146343328005812, "learning_rate": 9.67797005288181e-07, "token_acc": 0.7876543402671814, "epoch": 0.16025641025641027, "step": 25}, {"loss": 0.7657437324523926, "grad_norm": 6.460835549120558, "learning_rate": 9.639445136482546e-07, "token_acc": 0.7866899967193604, "epoch": 0.16666666666666666, "step": 26}, {"loss": 0.7088508009910583, "grad_norm": 5.989613299220979, "learning_rate": 9.598829836188693e-07, "token_acc": 0.8035452961921692, "epoch": 0.17307692307692307, "step": 27}, {"loss": 0.6949191093444824, "grad_norm": 6.862877869409243, "learning_rate": 9.556142451940679e-07, "token_acc": 0.8057082295417786, "epoch": 0.1794871794871795, "step": 28}, {"loss": 0.6835302114486694, "grad_norm": 5.454999319627048, "learning_rate": 9.511402217292925e-07, "token_acc": 0.8084285259246826, "epoch": 0.1858974358974359, "step": 29}, {"loss": 0.6599457263946533, "grad_norm": 5.4625192461823975, "learning_rate": 9.464629290747842e-07, "token_acc": 0.8131645321846008, "epoch": 0.19230769230769232, "step": 30}, {"loss": 0.6833065748214722, "grad_norm": 5.908943864151192, "learning_rate": 9.415844746673046e-07, "token_acc": 0.8138014078140259, "epoch": 0.1987179487179487, "step": 31}, {"loss": 0.6861380338668823, "grad_norm": 5.288567441191335, 
"learning_rate": 9.36507056580594e-07, "token_acc": 0.8055645823478699, "epoch": 0.20512820512820512, "step": 32}, {"loss": 0.633919894695282, "grad_norm": 5.357065287460616, "learning_rate": 9.312329625349901e-07, "token_acc": 0.818831741809845, "epoch": 0.21153846153846154, "step": 33}, {"loss": 0.6426697969436646, "grad_norm": 5.160602546431693, "learning_rate": 9.257645688666555e-07, "token_acc": 0.817148745059967, "epoch": 0.21794871794871795, "step": 34}, {"loss": 0.6970824003219604, "grad_norm": 4.864703496205445, "learning_rate": 9.201043394568771e-07, "token_acc": 0.8054612874984741, "epoch": 0.22435897435897437, "step": 35}, {"loss": 0.6301164627075195, "grad_norm": 5.209842274657687, "learning_rate": 9.14254824621921e-07, "token_acc": 0.8198198080062866, "epoch": 0.23076923076923078, "step": 36}, {"loss": 0.6474230885505676, "grad_norm": 5.060855111856781, "learning_rate": 9.082186599639427e-07, "token_acc": 0.8159399032592773, "epoch": 0.23717948717948717, "step": 37}, {"loss": 0.6971738338470459, "grad_norm": 4.618117753589896, "learning_rate": 9.019985651834703e-07, "token_acc": 0.805202841758728, "epoch": 0.24358974358974358, "step": 38}, {"loss": 0.6257902383804321, "grad_norm": 4.667129721936595, "learning_rate": 8.955973428539942e-07, "token_acc": 0.8184399604797363, "epoch": 0.25, "step": 39}, {"loss": 0.6510574221611023, "grad_norm": 4.758674560079032, "learning_rate": 8.890178771592197e-07, "token_acc": 0.8164493441581726, "epoch": 0.2564102564102564, "step": 40}, {"loss": 0.636375367641449, "grad_norm": 4.886139426025786, "learning_rate": 8.822631325935463e-07, "token_acc": 0.8183852434158325, "epoch": 0.26282051282051283, "step": 41}, {"loss": 0.6564359664916992, "grad_norm": 4.736579679060144, "learning_rate": 8.753361526263621e-07, "token_acc": 0.8196941614151001, "epoch": 0.2692307692307692, "step": 42}, {"loss": 0.638996958732605, "grad_norm": 4.5575813316907805, "learning_rate": 8.68240058330756e-07, "token_acc": 0.8199967741966248, 
"epoch": 0.27564102564102566, "step": 43}, {"loss": 0.6362872123718262, "grad_norm": 4.994153842899941, "learning_rate": 8.609780469772621e-07, "token_acc": 0.8194926977157593, "epoch": 0.28205128205128205, "step": 44}, {"loss": 0.6394486427307129, "grad_norm": 4.549016612724467, "learning_rate": 8.535533905932737e-07, "token_acc": 0.8174207210540771, "epoch": 0.28846153846153844, "step": 45}, {"loss": 0.6170175075531006, "grad_norm": 4.769737749020403, "learning_rate": 8.459694344887731e-07, "token_acc": 0.8237839937210083, "epoch": 0.2948717948717949, "step": 46}, {"loss": 0.5933050513267517, "grad_norm": 4.680957748858921, "learning_rate": 8.382295957490435e-07, "token_acc": 0.829018235206604, "epoch": 0.30128205128205127, "step": 47}, {"loss": 0.6095594167709351, "grad_norm": 4.52158152824765, "learning_rate": 8.303373616950406e-07, "token_acc": 0.8250677585601807, "epoch": 0.3076923076923077, "step": 48}, {"loss": 0.586349368095398, "grad_norm": 4.8131319269060135, "learning_rate": 8.222962883121195e-07, "token_acc": 0.8342939615249634, "epoch": 0.3141025641025641, "step": 49}, {"loss": 0.6297792196273804, "grad_norm": 4.874040165169469, "learning_rate": 8.141099986478212e-07, "token_acc": 0.8187986016273499, "epoch": 0.32051282051282054, "step": 50}, {"loss": 0.5949305295944214, "grad_norm": 4.942277020634408, "learning_rate": 8.057821811794457e-07, "token_acc": 0.8311688303947449, "epoch": 0.3269230769230769, "step": 51}, {"loss": 0.5739504098892212, "grad_norm": 4.3953959699546505, "learning_rate": 7.973165881521433e-07, "token_acc": 0.8309744596481323, "epoch": 0.3333333333333333, "step": 52}, {"loss": 0.6172730326652527, "grad_norm": 4.735913113697843, "learning_rate": 7.88717033888274e-07, "token_acc": 0.821295976638794, "epoch": 0.33974358974358976, "step": 53}, {"loss": 0.5573772192001343, "grad_norm": 4.378256610554005, "learning_rate": 7.799873930687977e-07, "token_acc": 0.8451364636421204, "epoch": 0.34615384615384615, "step": 54}, {"loss": 
0.5717079639434814, "grad_norm": 4.684218984763407, "learning_rate": 7.711315989874676e-07, "token_acc": 0.838281512260437, "epoch": 0.3525641025641026, "step": 55}, {"loss": 0.59979248046875, "grad_norm": 4.4280118932792325, "learning_rate": 7.621536417786158e-07, "token_acc": 0.8215728998184204, "epoch": 0.358974358974359, "step": 56}, {"loss": 0.554166316986084, "grad_norm": 4.826729862989936, "learning_rate": 7.530575666193282e-07, "token_acc": 0.8405694961547852, "epoch": 0.36538461538461536, "step": 57}, {"loss": 0.5708739161491394, "grad_norm": 4.359263862107659, "learning_rate": 7.438474719068173e-07, "token_acc": 0.834990382194519, "epoch": 0.3717948717948718, "step": 58}, {"loss": 0.5425753593444824, "grad_norm": 4.828866436495037, "learning_rate": 7.345275074118185e-07, "token_acc": 0.8373525142669678, "epoch": 0.3782051282051282, "step": 59}, {"loss": 0.5623891353607178, "grad_norm": 4.598613239569117, "learning_rate": 7.251018724088366e-07, "token_acc": 0.8328151702880859, "epoch": 0.38461538461538464, "step": 60}, {"loss": 0.5742952227592468, "grad_norm": 4.840462028626115, "learning_rate": 7.155748137840892e-07, "token_acc": 0.8290673494338989, "epoch": 0.391025641025641, "step": 61}, {"loss": 0.586388885974884, "grad_norm": 4.23328233570671, "learning_rate": 7.059506241219964e-07, "token_acc": 0.8265019059181213, "epoch": 0.3974358974358974, "step": 62}, {"loss": 0.5368714332580566, "grad_norm": 4.393885373170824, "learning_rate": 6.962336397710819e-07, "token_acc": 0.8431768417358398, "epoch": 0.40384615384615385, "step": 63}, {"loss": 0.6058262586593628, "grad_norm": 4.662530184420074, "learning_rate": 6.864282388901543e-07, "token_acc": 0.8214831352233887, "epoch": 0.41025641025641024, "step": 64}, {"loss": 0.5562135577201843, "grad_norm": 4.558811999067238, "learning_rate": 6.765388394756504e-07, "token_acc": 0.8388559222221375, "epoch": 0.4166666666666667, "step": 65}, {"loss": 0.5610250234603882, "grad_norm": 4.765395712458729, 
"learning_rate": 6.665698973710288e-07, "token_acc": 0.8305501937866211, "epoch": 0.4230769230769231, "step": 66}, {"loss": 0.5418926477432251, "grad_norm": 4.5677446123493635, "learning_rate": 6.565259042591111e-07, "token_acc": 0.8431202173233032, "epoch": 0.42948717948717946, "step": 67}, {"loss": 0.5533748865127563, "grad_norm": 4.529946815527597, "learning_rate": 6.464113856382751e-07, "token_acc": 0.8329079747200012, "epoch": 0.4358974358974359, "step": 68}, {"loss": 0.5477522015571594, "grad_norm": 4.457221273065978, "learning_rate": 6.362308987834115e-07, "token_acc": 0.8378193974494934, "epoch": 0.4423076923076923, "step": 69}, {"loss": 0.5534911751747131, "grad_norm": 4.611531783389826, "learning_rate": 6.259890306925626e-07, "token_acc": 0.841196596622467, "epoch": 0.44871794871794873, "step": 70}, {"loss": 0.5345719456672668, "grad_norm": 4.418661988438471, "learning_rate": 6.156903960201708e-07, "token_acc": 0.8396868705749512, "epoch": 0.4551282051282051, "step": 71}, {"loss": 0.573357343673706, "grad_norm": 4.438614534821985, "learning_rate": 6.053396349978631e-07, "token_acc": 0.8328611850738525, "epoch": 0.46153846153846156, "step": 72}, {"loss": 0.548120379447937, "grad_norm": 4.304860173689802, "learning_rate": 5.949414113437141e-07, "token_acc": 0.8377840518951416, "epoch": 0.46794871794871795, "step": 73}, {"loss": 0.554309606552124, "grad_norm": 4.194319397177033, "learning_rate": 5.845004101609246e-07, "token_acc": 0.8362563848495483, "epoch": 0.47435897435897434, "step": 74}, {"loss": 0.5796681642532349, "grad_norm": 4.6298008452210615, "learning_rate": 5.740213358268658e-07, "token_acc": 0.8328510522842407, "epoch": 0.4807692307692308, "step": 75}, {"loss": 0.5438037514686584, "grad_norm": 4.355472281518206, "learning_rate": 5.635089098734393e-07, "token_acc": 0.8434426188468933, "epoch": 0.48717948717948717, "step": 76}, {"loss": 0.5334382653236389, "grad_norm": 4.627740617106088, "learning_rate": 5.52967868859708e-07, "token_acc": 
0.8434890508651733, "epoch": 0.4935897435897436, "step": 77}, {"loss": 0.5313788652420044, "grad_norm": 4.2780408616986945, "learning_rate": 5.424029622377546e-07, "token_acc": 0.8449059128761292, "epoch": 0.5, "step": 78}, {"loss": 0.5232336521148682, "grad_norm": 4.092855208873091, "learning_rate": 5.318189502127331e-07, "token_acc": 0.8468860983848572, "epoch": 0.5064102564102564, "step": 79}, {"loss": 0.5497955083847046, "grad_norm": 4.428993590455852, "learning_rate": 5.212206015980741e-07, "token_acc": 0.8446012735366821, "epoch": 0.5128205128205128, "step": 80}, {"loss": 0.546926736831665, "grad_norm": 4.21577617143911, "learning_rate": 5.106126916668118e-07, "token_acc": 0.8366925716400146, "epoch": 0.5192307692307693, "step": 81}, {"loss": 0.5270861387252808, "grad_norm": 4.438862747258348, "learning_rate": 5e-07, "token_acc": 0.8371419310569763, "epoch": 0.5256410256410257, "step": 82}, {"loss": 0.5339565277099609, "grad_norm": 4.176826355296084, "learning_rate": 4.893873083331882e-07, "token_acc": 0.8417102098464966, "epoch": 0.532051282051282, "step": 83}, {"loss": 0.5424758195877075, "grad_norm": 4.530198529065255, "learning_rate": 4.787793984019259e-07, "token_acc": 0.8395857810974121, "epoch": 0.5384615384615384, "step": 84}, {"loss": 0.5200133919715881, "grad_norm": 4.370509876032819, "learning_rate": 4.681810497872668e-07, "token_acc": 0.8433917760848999, "epoch": 0.5448717948717948, "step": 85}, {"loss": 0.5480057597160339, "grad_norm": 4.382905385719549, "learning_rate": 4.5759703776224555e-07, "token_acc": 0.8397650718688965, "epoch": 0.5512820512820513, "step": 86}, {"loss": 0.5228407382965088, "grad_norm": 4.342442518129754, "learning_rate": 4.47032131140292e-07, "token_acc": 0.8539518713951111, "epoch": 0.5576923076923077, "step": 87}, {"loss": 0.5413128137588501, "grad_norm": 4.137341022557355, "learning_rate": 4.364910901265606e-07, "token_acc": 0.8368589878082275, "epoch": 0.5641025641025641, "step": 88}, {"loss": 0.5160300731658936, 
"grad_norm": 4.369749651287342, "learning_rate": 4.259786641731343e-07, "token_acc": 0.8478597402572632, "epoch": 0.5705128205128205, "step": 89}, {"loss": 0.5385915637016296, "grad_norm": 4.5684142339974585, "learning_rate": 4.154995898390755e-07, "token_acc": 0.8457493185997009, "epoch": 0.5769230769230769, "step": 90}, {"loss": 0.5259469747543335, "grad_norm": 4.263077787933313, "learning_rate": 4.0505858865628575e-07, "token_acc": 0.8433637022972107, "epoch": 0.5833333333333334, "step": 91}, {"loss": 0.5241655111312866, "grad_norm": 4.246658460460572, "learning_rate": 3.94660365002137e-07, "token_acc": 0.8478260636329651, "epoch": 0.5897435897435898, "step": 92}, {"loss": 0.5464353561401367, "grad_norm": 4.4264562426283325, "learning_rate": 3.8430960397982926e-07, "token_acc": 0.8348221778869629, "epoch": 0.5961538461538461, "step": 93}, {"loss": 0.5346203446388245, "grad_norm": 4.571352573379081, "learning_rate": 3.7401096930743746e-07, "token_acc": 0.8426615595817566, "epoch": 0.6025641025641025, "step": 94}, {"loss": 0.4898928999900818, "grad_norm": 4.302883692806041, "learning_rate": 3.637691012165886e-07, "token_acc": 0.8545727133750916, "epoch": 0.6089743589743589, "step": 95}, {"loss": 0.5597435235977173, "grad_norm": 4.488514668810176, "learning_rate": 3.535886143617248e-07, "token_acc": 0.8423628211021423, "epoch": 0.6153846153846154, "step": 96}, {"loss": 0.5908330082893372, "grad_norm": 4.5047373126179036, "learning_rate": 3.4347409574088894e-07, "token_acc": 0.831405758857727, "epoch": 0.6217948717948718, "step": 97}, {"loss": 0.5422452092170715, "grad_norm": 4.326989989523886, "learning_rate": 3.334301026289712e-07, "token_acc": 0.8455585837364197, "epoch": 0.6282051282051282, "step": 98}, {"loss": 0.4947037994861603, "grad_norm": 4.071817598010613, "learning_rate": 3.234611605243496e-07, "token_acc": 0.8524978756904602, "epoch": 0.6346153846153846, "step": 99}, {"loss": 0.4820348620414734, "grad_norm": 4.364292580374463, "learning_rate": 
3.135717611098457e-07, "token_acc": 0.8573145866394043, "epoch": 0.6410256410256411, "step": 100}, {"loss": 0.577752411365509, "grad_norm": 4.245101360198055, "learning_rate": 3.037663602289181e-07, "token_acc": 0.8319244384765625, "epoch": 0.6474358974358975, "step": 101}, {"loss": 0.5117242336273193, "grad_norm": 4.3455778338843, "learning_rate": 2.940493758780037e-07, "token_acc": 0.8504513502120972, "epoch": 0.6538461538461539, "step": 102}, {"loss": 0.5905250310897827, "grad_norm": 4.354835545935755, "learning_rate": 2.8442518621591084e-07, "token_acc": 0.829204261302948, "epoch": 0.6602564102564102, "step": 103}, {"loss": 0.5381240248680115, "grad_norm": 4.373092445004486, "learning_rate": 2.748981275911633e-07, "token_acc": 0.83758544921875, "epoch": 0.6666666666666666, "step": 104}, {"loss": 0.5173760056495667, "grad_norm": 4.158187745451947, "learning_rate": 2.6547249258818163e-07, "token_acc": 0.8419111371040344, "epoch": 0.6730769230769231, "step": 105}, {"loss": 0.5291812419891357, "grad_norm": 4.360270954918175, "learning_rate": 2.561525280931828e-07, "token_acc": 0.8481848239898682, "epoch": 0.6794871794871795, "step": 106}, {"loss": 0.5495140552520752, "grad_norm": 4.394654818317897, "learning_rate": 2.469424333806718e-07, "token_acc": 0.8393661379814148, "epoch": 0.6858974358974359, "step": 107}, {"loss": 0.528519868850708, "grad_norm": 4.4823775203281, "learning_rate": 2.378463582213842e-07, "token_acc": 0.8453878164291382, "epoch": 0.6923076923076923, "step": 108}, {"loss": 0.4981050491333008, "grad_norm": 4.219004456899406, "learning_rate": 2.2886840101253247e-07, "token_acc": 0.8480120897293091, "epoch": 0.6987179487179487, "step": 109}, {"loss": 0.5449967980384827, "grad_norm": 4.529812908764033, "learning_rate": 2.2001260693120232e-07, "token_acc": 0.838426411151886, "epoch": 0.7051282051282052, "step": 110}, {"loss": 0.5300467610359192, "grad_norm": 4.264681112622175, "learning_rate": 2.112829661117259e-07, "token_acc": 0.8408368229866028, 
"epoch": 0.7115384615384616, "step": 111}, {"loss": 0.5546152591705322, "grad_norm": 4.224432096547358, "learning_rate": 2.026834118478567e-07, "token_acc": 0.8335220813751221, "epoch": 0.717948717948718, "step": 112}, {"loss": 0.5911360383033752, "grad_norm": 4.40275179389628, "learning_rate": 1.9421781882055443e-07, "token_acc": 0.8269292712211609, "epoch": 0.7243589743589743, "step": 113}, {"loss": 0.5440055131912231, "grad_norm": 4.438604539947117, "learning_rate": 1.858900013521788e-07, "token_acc": 0.8439837098121643, "epoch": 0.7307692307692307, "step": 114}, {"loss": 0.5248892307281494, "grad_norm": 4.3373601365266605, "learning_rate": 1.777037116878804e-07, "token_acc": 0.8428595662117004, "epoch": 0.7371794871794872, "step": 115}, {"loss": 0.5044262409210205, "grad_norm": 3.9787786664284495, "learning_rate": 1.6966263830495935e-07, "token_acc": 0.8520953059196472, "epoch": 0.7435897435897436, "step": 116}, {"loss": 0.5238938331604004, "grad_norm": 4.206466011537137, "learning_rate": 1.6177040425095663e-07, "token_acc": 0.8503661751747131, "epoch": 0.75, "step": 117}, {"loss": 0.4803212583065033, "grad_norm": 4.11543543662316, "learning_rate": 1.5403056551122694e-07, "token_acc": 0.861027181148529, "epoch": 0.7564102564102564, "step": 118}, {"loss": 0.5092470645904541, "grad_norm": 4.307365738144286, "learning_rate": 1.4644660940672627e-07, "token_acc": 0.847130537033081, "epoch": 0.7628205128205128, "step": 119}, {"loss": 0.5285454988479614, "grad_norm": 4.228969596612662, "learning_rate": 1.3902195302273778e-07, "token_acc": 0.8402243852615356, "epoch": 0.7692307692307693, "step": 120}, {"loss": 0.5160696506500244, "grad_norm": 4.317437082837098, "learning_rate": 1.3175994166924392e-07, "token_acc": 0.843143880367279, "epoch": 0.7756410256410257, "step": 121}, {"loss": 0.5897269248962402, "grad_norm": 4.206230433373526, "learning_rate": 1.2466384737363779e-07, "token_acc": 0.8222737908363342, "epoch": 0.782051282051282, "step": 122}, {"loss": 
0.5243645906448364, "grad_norm": 4.537270673759693, "learning_rate": 1.1773686740645383e-07, "token_acc": 0.8434827327728271, "epoch": 0.7884615384615384, "step": 123}, {"loss": 0.5309593677520752, "grad_norm": 4.338972097867842, "learning_rate": 1.1098212284078035e-07, "token_acc": 0.8410027623176575, "epoch": 0.7948717948717948, "step": 124}, {"loss": 0.5152863264083862, "grad_norm": 4.172268942442711, "learning_rate": 1.0440265714600571e-07, "token_acc": 0.8451645970344543, "epoch": 0.8012820512820513, "step": 125}, {"loss": 0.5396726131439209, "grad_norm": 4.250149736675509, "learning_rate": 9.800143481652979e-08, "token_acc": 0.8436694741249084, "epoch": 0.8076923076923077, "step": 126}, {"loss": 0.5297789573669434, "grad_norm": 4.405607345694319, "learning_rate": 9.17813400360572e-08, "token_acc": 0.8374651074409485, "epoch": 0.8141025641025641, "step": 127}, {"loss": 0.4773644208908081, "grad_norm": 4.256107645791666, "learning_rate": 8.574517537807896e-08, "token_acc": 0.8574733138084412, "epoch": 0.8205128205128205, "step": 128}, {"loss": 0.5019025802612305, "grad_norm": 4.402345485095196, "learning_rate": 7.989566054312286e-08, "token_acc": 0.8487906455993652, "epoch": 0.8269230769230769, "step": 129}, {"loss": 0.509482741355896, "grad_norm": 4.222991640512232, "learning_rate": 7.423543113334435e-08, "token_acc": 0.8477755784988403, "epoch": 0.8333333333333334, "step": 130}, {"loss": 0.5533698201179504, "grad_norm": 4.331972240710486, "learning_rate": 6.876703746500984e-08, "token_acc": 0.8373857736587524, "epoch": 0.8397435897435898, "step": 131}, {"loss": 0.5067212581634521, "grad_norm": 4.085679298862501, "learning_rate": 6.349294341940592e-08, "token_acc": 0.8518636226654053, "epoch": 0.8461538461538461, "step": 132}, {"loss": 0.49411821365356445, "grad_norm": 4.273453737642771, "learning_rate": 5.8415525332695334e-08, "token_acc": 0.8478958606719971, "epoch": 0.8525641025641025, "step": 133}, {"loss": 0.5529794692993164, "grad_norm": 
4.262340473425649, "learning_rate": 5.353707092521581e-08, "token_acc": 0.8371466994285583, "epoch": 0.8589743589743589, "step": 134}, {"loss": 0.509425699710846, "grad_norm": 4.382139460926817, "learning_rate": 4.885977827070747e-08, "token_acc": 0.8512613773345947, "epoch": 0.8653846153846154, "step": 135}, {"loss": 0.5211268067359924, "grad_norm": 4.62171216340832, "learning_rate": 4.4385754805932095e-08, "token_acc": 0.84375, "epoch": 0.8717948717948718, "step": 136}, {"loss": 0.49088388681411743, "grad_norm": 4.204088741197678, "learning_rate": 4.0117016381130634e-08, "token_acc": 0.8564977645874023, "epoch": 0.8782051282051282, "step": 137}, {"loss": 0.5041382312774658, "grad_norm": 4.377681174151091, "learning_rate": 3.6055486351745324e-08, "token_acc": 0.8541952967643738, "epoch": 0.8846153846153846, "step": 138}, {"loss": 0.5480432510375977, "grad_norm": 4.32917767077044, "learning_rate": 3.220299471181898e-08, "token_acc": 0.8370358347892761, "epoch": 0.8910256410256411, "step": 139}, {"loss": 0.5408469438552856, "grad_norm": 4.190278678307778, "learning_rate": 2.8561277269457895e-08, "token_acc": 0.8408304452896118, "epoch": 0.8974358974358975, "step": 140}, {"loss": 0.5250701904296875, "grad_norm": 4.429799923194152, "learning_rate": 2.513197486473406e-08, "token_acc": 0.8404897451400757, "epoch": 0.9038461538461539, "step": 141}, {"loss": 0.49129876494407654, "grad_norm": 4.240556447333352, "learning_rate": 2.1916632630374577e-08, "token_acc": 0.8528439402580261, "epoch": 0.9102564102564102, "step": 142}, {"loss": 0.5107077360153198, "grad_norm": 4.094432099971454, "learning_rate": 1.8916699295575323e-08, "token_acc": 0.8480400443077087, "epoch": 0.9166666666666666, "step": 143}, {"loss": 0.48216086626052856, "grad_norm": 4.096105481468722, "learning_rate": 1.6133526533250563e-08, "token_acc": 0.8517335057258606, "epoch": 0.9230769230769231, "step": 144}, {"loss": 0.4809270203113556, "grad_norm": 3.999889432088483, "learning_rate": 
1.3568368351012716e-08, "token_acc": 0.8589807748794556, "epoch": 0.9294871794871795, "step": 145}, {"loss": 0.4755428731441498, "grad_norm": 4.312742288765351, "learning_rate": 1.1222380526156927e-08, "token_acc": 0.8620191216468811, "epoch": 0.9358974358974359, "step": 146}, {"loss": 0.5387917757034302, "grad_norm": 4.535728213741053, "learning_rate": 9.096620084905471e-09, "token_acc": 0.8353335857391357, "epoch": 0.9423076923076923, "step": 147}, {"loss": 0.4899570345878601, "grad_norm": 4.1679468407507905, "learning_rate": 7.1920448261457715e-09, "token_acc": 0.8593058586120605, "epoch": 0.9487179487179487, "step": 148}, {"loss": 0.5454012155532837, "grad_norm": 4.2351778138083604, "learning_rate": 5.509512889877332e-09, "token_acc": 0.8417900800704956, "epoch": 0.9551282051282052, "step": 149}, {"loss": 0.523182213306427, "grad_norm": 4.296677102692333, "learning_rate": 4.049782370561583e-09, "token_acc": 0.8438036441802979, "epoch": 0.9615384615384616, "step": 150}, {"loss": 0.5141420364379883, "grad_norm": 4.323468547344599, "learning_rate": 2.813510975548772e-09, "token_acc": 0.8459829092025757, "epoch": 0.967948717948718, "step": 151}, {"loss": 0.5381837487220764, "grad_norm": 4.286188181441807, "learning_rate": 1.8012557287367391e-09, "token_acc": 0.8385339379310608, "epoch": 0.9743589743589743, "step": 152}, {"loss": 0.5164992809295654, "grad_norm": 4.395230614166292, "learning_rate": 1.013472719593733e-09, "token_acc": 0.848739504814148, "epoch": 0.9807692307692307, "step": 153}, {"loss": 0.5406173467636108, "grad_norm": 4.382167968792246, "learning_rate": 4.5051689765929213e-10, "token_acc": 0.8395528793334961, "epoch": 0.9871794871794872, "step": 154}, {"loss": 0.5075889229774475, "grad_norm": 4.218184305013247, "learning_rate": 1.1264191261528555e-10, "token_acc": 0.8444151282310486, "epoch": 0.9935897435897436, "step": 155}, {"loss": 0.5462541580200195, "grad_norm": 4.958298958687034, "learning_rate": 0.0, "token_acc": 0.8386925458908081, "epoch": 
1.0, "step": 156}, {"eval_loss": 0.5514746904373169, "eval_runtime": 10.0275, "eval_samples_per_second": 0.898, "eval_steps_per_second": 0.199, "eval_token_acc": 0.8481510877609253, "epoch": 1.0, "step": 156}, {"eval_loss": 0.5514746904373169, "eval_runtime": 12.1526, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.165, "eval_token_acc": 0.8481510877609253, "epoch": 1.0, "step": 156}, {"train_runtime": 13045.8923, "train_samples_per_second": 0.763, "train_steps_per_second": 0.012, "total_flos": 69981770547200.0, "train_loss": 0.6374106772052937, "epoch": 1.0, "step": 156}], "memory": 92.6015625}

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/runs/events.out.tfevents.1755695667.TENCENT64.site.2602616.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f775150e89c38ae07379b75d60c0df41cb1151e270a0dbe6db976e3d848e7eca
+size 49552

qwen2.5-7b-base-1500q-1500q-1e_6-old-click-1ep/val_dataset.jsonl ADDED Viewed

	@@ -0,0 +1,9 @@

+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which of the following is known for the song Sister Havana: Urge Overkill or Ministry?\nObservation: [1947] RootWebArea 'Search: Sister Havana Urge Overkill' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=Sister+Havana+Urge+Overkill\n\t[1968] textbox \"Search 'Wikipedia'\" required: False\n\t[1974] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1975] button '🏠'\n\t[1976] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1977] button 'Wikipedia'\n\t[1978] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1979] button '🎲'\n\t[1839] StaticText 'Results 1-20 of 20 for \"Sister Havana Urge Overkill\"'\n\t[1987] link 'Saturation (Urge Overkill album)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Saturation_(Urge_Overkill_album)\n\t[1846] StaticText '...\"Sister Havana\" and \"Positive Bleeding\" as singles in the US and Europe. \"Sister Havana\" charted highly on both the modern rock and mainstream rock charts, peaking at numbers 6 and 10, respectively, while \"Positive Bleeding\" became a minor rock radio hit. Saturation Studio album by Urge Overkill Released June 8, 1993 Recorded December 17, 1992–January 9, 1993 Genre Alternative rock Length 70:01 Label Geffen Producer Butcher Bros., Andy Kravitz Urge Overkill chronology Stull (1992) Saturation......'\n\t[1855] StaticText 'from Wikipedia'\n\t[1856] StaticText '635 words'\n\t[1996] link 'Urge Overkill' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Urge_Overkill\n\t[1858] StaticText '...\"Sister Havana\" and their cover of Neil Diamond\\'s \"Girl, You\\'ll Be a Woman Soon\", which was notably used in Quentin Tarantino\\'s Pulp Fiction. Their latest album, Oui, was released in 2022. 
Urge Overkill Background information Origin Chicago, Illinois, United States Genres Alternative rock, hard rock Years active 1986–1997, 2004–present Labels Touch and Go, Geffen Website www.urgeoverkill.com Members Nash Kato Eddie \"King\" Roeser Nate Arling Adam Arling Past members Johnny \"Blackie Onassis\"......'\n\t[1863] StaticText 'from Wikipedia'\n\t[1864] StaticText '1,430 words'\n\t[2003] link 'Triple J Hottest 100, 1993' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Triple_J_Hottest_100,_1993\n\t[1872] StaticText '1,786 words'\n\t[2010] link 'Hit Machine' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Hit_Machine\n\t[1874] StaticText '...\"Sister Havana\" (3:43) Peter Andre – \"Let\\'s Get It On\" (4:09) Leftfield featuring John Lydon – \"Open Up\" (3:46) Xscape – \"Just Kickin\\' It\" (3:25) Ace of Base – \"Happy Nation\" (3:28) Defryme – \"Pure Killer\" (4:04) Funky Poets – \"Born in the Ghetto\" (4:18) Certifications Region Certification Certified units/sales Australia (ARIA)[7] Platinum 70,000^ ^ Shipments figures based on certification alone. Hit Machine 5 Hit Machine 5 Compilation album by Various artists Released 1994 Genre......'\n\t[1879] StaticText 'from Wikipedia'\n\t[1880] StaticText '5,688 words'\n\t[2017] link 'Reading and Leeds Festivals line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Reading_and_Leeds_Festivals_line-ups\n\t[1886] StaticText '16,832 words'\n\t[2023] link 'Glastonbury Festival line-ups' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Glastonbury_Festival_line-ups\nHISTORY_ACTION: type [328] [Sister Havana] [1]\ntype [1335] [Sister Havana Urge Overkill] [1]\n"}, {"role": "assistant", "content": "<think>The search results and snippets indicate that \"Sister Havana\" is a song by Urge Overkill. The text mentions \"Sister Havana\" as a single by Urge Overkill, and there is no mention of Ministry in connection with the song. 
I have enough information to answer the question.</think>\n```stop [Urge Overkill is known for the song \"Sister Havana\".]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: what class of food does egg belong to\nObservation: [1481] RootWebArea 'Search: egg' focused: True url: http://localhost:22015/search?content=wikipedia_en_all_maxi_2022-05&pattern=egg\n\t[1502] textbox \"Search 'Wikipedia'\" required: False\n\t[1508] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[1509] button '🏠'\n\t[1510] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[1511] button 'Wikipedia'\n\t[1512] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[1513] button '🎲'\n\t[1033] StaticText 'Results 1-25 of 
78,533 for \"egg\"'\n\t[1521] link 'List of egg topics' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_topics\n\t[1040] StaticText '...Egg dishes and List of egg dishes A to Z egg dishes Balut (egg) Bai pong moan Boiled egg Brik Century egg Changua Chinese steamed eggs Çılbır Coddled egg Custard Deviled egg Devonshire eggs Egg bhurji Egg drop soup Egg foo young Egg in the basket Egg khagina Egg salad Egg sandwich Eggs Benedict Eggs Neptune Eggs Sardou Eggs and brains French toast Fried egg Fritaja Frittata Haminados Hangtown fry Huevos divorciados[lower-alpha 1] Huevos motuleños[lower-alpha 1] Huevos rancheros[lower-alpha 1]......'\n\t[1073] StaticText 'from Wikipedia'\n\t[1074] StaticText '907 words'\n\t[1542] link 'Boiled egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Boiled_egg\n\t[1076] StaticText '...egg white and egg yolk both solidify, while soft-boiled eggs may leave the yolk, and sometimes the white, at least partially liquid and raw. Boiled eggs are a popular breakfast food around the world. For eggs cooked without their shells in hot water, see poached egg. For other egg dishes, see list of egg dishes. Boiled egg A soft-boiled egg served in the half shell Main ingredients Eggs (typically chicken) Variations Baked eggs, starting temperature, preparation Food energy (per 100\\xa0g serving)......'\n\t[1091] StaticText 'from Wikipedia'\n\t[1092] StaticText '2,185 words'\n\t[1554] link 'Egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg\n\t[1094] StaticText 'Egg An egg is the organic vessel containing the zygote in which an embryo develops until it can survive on its own, at which point the animal hatches. An egg results from fertilization of an egg cell. Most arthropods, vertebrates (excluding live-bearing mammals), and mollusks lay eggs, although some, such as scorpions, do not. This article is about biological eggs. For eggs as food, see Egg as food. For other uses, see Egg (disambiguation). 
Eggs of various birds, a reptile, various cartilaginous......'\n\t[1106] StaticText 'from Wikipedia'\n\t[1107] StaticText '5,196 words'\n\t[1565] link 'Easter egg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Easter_egg\n\t[1109] StaticText '...egg (media). Decorated Easter eggs Easter egg of the Ukrainian variety with the Paschal greeting \"Christ is Risen!\" A chocolate Easter egg Although eggs, in general, were a traditional symbol of fertility and rebirth,[2] in Christianity, for the celebration of Eastertide, Easter eggs symbolize the empty tomb of Jesus, from which Jesus was resurrected.[3][4][5] In addition, one ancient tradition was the staining of Easter eggs with the colour red \"in memory of the blood of Christ, shed as at that......'\n\t[1116] StaticText 'from Wikipedia'\n\t[1117] StaticText '6,153 words'\n\t[1573] link 'Egg as food' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Egg_as_food\n\t[1119] StaticText '...(egg white), and vitellus (egg yolk), contained within various thin membranes. The most commonly consumed eggs are chicken eggs. Other poultry eggs including those of duck and quail also are eaten. Fish eggs are called roe and caviar. \"Chicken egg\" redirects here. For the causality dilemma, see Chicken or the egg. For Egg (biology), see Egg. For other uses, see Egg (disambiguation). Fried eggs with carrot, Parmesan and cream Egg yolks and whole eggs store significant amounts of protein and......'\n\t[1136] StaticText 'from Wikipedia'\n\t[1137] StaticText '9,695 words'\n\t[1586] link 'List of egg dishes' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/List_of_egg_dishes\n\t[1139] StaticText '...(egg yolk), contained within many different thin membranes all surrounded by a protective eggshell. 
Popular choices for egg consumption are chicken, duck, quail, roe, caviar, and emu, but the '\nHISTORY_ACTION: type [330] [egg] [1]\n"}, {"role": "assistant", "content": "<think>The link titled 'Egg as food' is likely to contain information about the class of food to which eggs belong. Clicking this link should provide the relevant details.</think>\n```click [1573] [Egg as food]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Lawrence D. 
Cohen's Carrie:The Musical is an adaptation of Stephen King's novel, which centers around a socially awkward teenage girl with what newly discovered powers?\nObservation: [2717] RootWebArea 'Carrie (musical)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)\n\t[2829] textbox \"Search 'Wikipedia'\" required: False\n\t[2835] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[2836] button '🏠'\n\t[2837] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[2838] button 'Wikipedia'\n\t[2839] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[2840] button '🎲'\n\t[2847] heading 'Carrie (musical)' hasPopup: menu\n\t[1789] StaticText 'Carrie is a '\n\t[2719] link 'musical' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Musical_theatre\n\t[1792] StaticText ' with a book by '\n\t[2854] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t[1794] StaticText ', lyrics by '\n\t[2855] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t[1796] StaticText ', and music by '\n\t[2856] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t[1798] StaticText '. Adapted from '\n\t[2857] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t[2859] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1802] StaticText ', it focuses on an awkward teenage girl with '\n\t[2860] link 'telekinetic powers' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Psychokinesis\n\t[1804] StaticText ' whose lonely life is dominated by an oppressive '\n\t[2861] link 'religious fanatic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Religious_fanaticism\n\t[1806] StaticText ' mother. 
When she is humiliated by her classmates at the high school '\n\t[2862] link 'prom' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Prom\n\t[1808] StaticText ', she unleashes chaos on everyone and everything in her path.'\n\t[2864] table ''\n\t\t[2866] row ''\n\t\t\t[2867] columnheader 'Carrie' required: False\n\t\t[2870] row ''\n\t\t\t[2871] cell 'Official poster for the original Stratford-upon-Avon production'\n\t\t\t\t[2768] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Carrie_the_Musical_Poster.jpg.webp\n\t\t[2874] row ''\n\t\t\t[2875] rowheader 'Music' required: False\n\t\t\t[2876] cell 'Michael Gore'\n\t\t\t\t[2877] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[2878] row ''\n\t\t\t[2879] rowheader 'Lyrics' required: False\n\t\t\t[2880] cell 'Dean Pitchford'\n\t\t\t\t[2881] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[2882] row ''\n\t\t\t[2883] rowheader 'Book' required: False\n\t\t\t\t[2884] link 'Book' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Book_(musical_theatre)\n\t\t\t[2885] cell 'Lawrence D. Cohen'\n\t\t\t\t[2886] link 'Lawrence D. Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[2887] row ''\n\t\t\t[2888] rowheader 'Basis' required: False\n\t\t\t[2889] cell 'Carrie by Stephen King'\n\t\t\t\t[2890] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(novel)\n\t[1830] StaticText 'Originally premiering in the U.K. in 1988, Carrie opened on '\n\t[2899] link 'Broadway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1834] StaticText ' at the Virginia Theatre the same year, but closed after 16 previews and five regular performances. 
Due to the passionate response from both its critics and its fans, the show is considered one of the most notable failures in '\n\t[2900] link 'Broadway theatre' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Broadway_theatre\n\t[1836] StaticText ' history; a 1991 book written by '\n\t[2901] link 'Ken Mandelbaum' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ken_Mandelbaum\n\t[1838] StaticText ', which chronicled the history of flop Broadway musicals, was partially entitled Not Since Carrie, and a 2021 podcast, Out for Blood, documented its creation and development.'\n\t[2904] group ''\n\t\t[2909] DisclosureTriangle 'Production history' expanded: True\n\t\t\t[2910] heading 'Production history'\n\t\t[1845] StaticText 'Inspired by a 1981 performance of '\n\t\t[2912] link 'Alban Berg' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alban_Berg\n\t\t[2914] link 'Lulu' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lulu_(opera)\n\t\t[1849] StaticText ' at the '\n\t\t[2915] link 'Metropolitan Opera House' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Metropolitan_Opera_House_(Lincoln_Center)\n\t\t[2917] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-DVD_featurette-1\n\t\t[2919] link 'Lawrence D. 
Cohen' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lawrence_D._Cohen\n\t\t[1854] StaticText ', who wrote the script for the 1976 film version of '\n\t\t[2920] link 'Carrie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(1976_film)\n\t\t[1856] StaticText ', and '\n\t\t[2922] link 'Michael Gore' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Michael_Gore\n\t\t[1858] StaticText ' began work on a musical based on the '\n\t\t[2923] link 'Stephen King' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Stephen_King\n\t\t[2925] link 'Fame' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fame_(1980_film)\n\t\t[1862] StaticText ' collaborator, '\n\t\t[2926] link 'Dean Pitchford' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dean_Pitchford\n\t\t[1864] StaticText ', was brought in to work on the project, which underwent numerous rewrites.'\n\t\t[2928] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Carrie_(musical)#cite_note-2\n\t\t[1865] StaticText ' In August 1984, a '\n\t\t[2930] link 'workshop' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Workshop\n\t\t[1867] StaticText ' of the first act was staged at 890 Broadway in New York City, with '\n\t\t[2931] link 'Annie Golden' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Annie_Golden\n\t\t[1869] StaticText ' as Carrie, '\n\t\t[2932] link 'Maureen McGovern' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Maureen_McGovern\n\t\t[1871] StaticText ' as Mrs. White, '\n\t\t[2933] link 'Laurie Beechman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Laurie_Beechman\n\t\t[1873] StaticText ' as Mrs. Gardner, and '\n\t\t[2934] link 'Liz Callaway' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Liz_Callaway\n\t\t[1875] StaticText ' as Chris. 
It was soon announced that Carrie would be produced on '\nHISTORY_ACTION: type [327] [Carrie (musical)] [1]\nclick [1309] [Carrie (musical)]\n"}, {"role": "assistant", "content": "<think>The objective is to identify what newly discovered powers the main character, Carrie, has in Lawrence D. Cohen's adaptation. The text in the accessibility tree states: \"it focuses on an awkward teenage girl with telekinetic powers whose lonely life is dominated by an oppressive religious fanatic mother.\" This directly answers the question.</think>\n```stop [telekinetic powers]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Name the second world war trilogy written by the English writer, Evelyn Waugh, who is also known for his travel writings in the 1930s and a later travel book  called 'A Tourist in Africa' which appeared in 1960?\nObservation: [292] RootWebArea 'User:The other Kiwix guy/Landing' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing\n\t[329] textbox \"Search 'Wikipedia'\" required: False\n\t[335] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[336] button '🏠'\n\t[337] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[338] 
button 'Wikipedia'\n\t[339] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[340] button '🎲'\n\t[3] StaticText 'Welcome to '\n\t[351] link 'Wikipedia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wikipedia\n\t[5] StaticText 'The free encyclopedia.'\n\t[6] StaticText '6,489,052'\n\t[7] StaticText ' articles in '\n\t[357] link 'English' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/English_Wikipedia\n\t[359] heading 'Arts'\n\t[361] link 'Architecture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Architecture\n\t[362] link 'Books' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Books\n\t[363] link 'Cinematography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cinematography\n\t[364] link 'Dance' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dance\n\t[365] link 'Design' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Design\n\t[366] link 'Fashion' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fashion\n\t[367] link 'Films' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Films\n\t[368] link 'Gastronomy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gastronomy\n\t[369] link 'Literature' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Literature\n\t[370] link 'Magic (illusion)' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Magic_(illusion)\n\t[371] link 'Music' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Music\n\t[372] link 'Painting' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Painting\n\t[373] link 'Photography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Photography\n\t[374] link 'Poetry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Poetry\n\t[375] link 'Sculpture' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sculpture\n\t[376] link 'Theatre' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Theatre\n\t[378] heading 'Geography'\n\t[380] link 'Africa' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Africa\n\t[381] link 'Antarctica' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Antarctica\n\t[382] link 'Arctic' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Arctic\n\t[383] link 'Asia' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Asia\n\t[384] link 'Caribbean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Caribbean\n\t[385] link 'Central America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Central_America\n\t[386] link 'Europe' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Europe\n\t[387] link 'Latin America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Latin_America\n\t[388] link 'Mediterranean' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mediterranean\n\t[389] link 'Middle East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_East\n\t[390] link 'North America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/North_America\n\t[391] link 'Oceania' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Oceania\n\t[392] link 'South America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/South_America\n\t[393] link 'Cartography' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Cartography\n\t[395] heading 'History'\n\t[397] link 'Ancient Egypt' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Egypt\n\t[398] link 'Ancient Greece' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Greece\n\t[399] link 'Ancient Near East' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Near_East\n\t[400] link 'Ancient Rome' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ancient_Rome\n\t[401] link 'Archaeology' url: 
http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Archaeology\n\t[402] link 'British Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/British_Empire\n\t[294] link 'Byzantine Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Byzantine_Empire\n\t[403] link 'Colonialism' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Colonialism\n\t[404] link 'Crusades' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Crusades\n\t[405] link 'Heraldry' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heraldry\n\t[406] link 'History of science' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_science\n\t[407] link 'Imperial China' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_China#Imperial_China\n\t[408] link 'Indian independence movement' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Indian_independence_movement\n\t[409] link 'Japan' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_Japan\n\t[410] link 'Middle Ages' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Middle_Ages\n\t[411] link 'Mughal Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mughal_Empire\n\t[412] link 'Ottoman Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Ottoman_Empire\n\t[413] link 'Russian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Russian_Empire\n\t[414] link 'Sasanian Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Sasanian_Empire\n\t[415] link 'Seljuk Empire' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Seljuk_Empire\n\t[416] link 'Soviet Union' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Soviet_Union\n\t[417] link 'War' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/War\n\t[419] heading 'Sciences'\nHISTORY_ACTION: \n\n"}, {"role": "assistant", "content": "<think>The objective is to find the name of the Second 
World War trilogy written by Evelyn Waugh. The current page has a search textbox with id [329] labeled \"Search 'Wikipedia'\". The most efficient way to find the answer is to search for \"Evelyn Waugh\" in Wikipedia, as his article should mention his major works, including the WWII trilogy. I will type \"Evelyn Waugh\" into the search box and press enter.</think>\n```type [329] [Evelyn Waugh] [1]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: Which Wisconsin Badgers player won a Heismann trophy and was elected to the Pro Bowl in each of their first four seasons?\nObservation: [13584] RootWebArea 'Alan Ameche' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche\n\t[13622] textbox \"Search 'Wikipedia'\" required: False\n\t[13628] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[13629] button '🏠'\n\t[13630] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[13631] button 'Wikipedia'\n\t[13632] link 'Go to a randomly selected page' url: 
http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[13633] button '🎲'\n\t[13640] heading 'Alan Ameche' hasPopup: menu\n\t[13027] StaticText 'Alan Ameche (/əmiːtʃi/; June 1, 1933 – August 8, 1988), nicknamed \"The Iron Horse\", or simply \"The Horse\", was an '\n\t[13655] link 'American football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/American_football\n\t[13041] StaticText ' player who played six seasons with the '\n\t[13656] link 'Baltimore Colts' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/History_of_the_Baltimore_Colts\n\t[13043] StaticText ' in the '\n\t[13657] link 'National Football League' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/National_Football_League\n\t[13045] StaticText ' (NFL). He played '\n\t[13658] link 'college football' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/College_football\n\t[13047] StaticText ' at the '\n\t[13659] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t[13049] StaticText ' and won the '\n\t[13660] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\n\t[13051] StaticText ' during his senior season in '\n\t[13661] link '1954' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_Wisconsin_Badgers_football_team\n\t[13053] StaticText '. He was elected to the '\n\t[13662] link 'Pro Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Pro_Bowl\n\t[13055] StaticText ' in each of his first four seasons in the league. 
He is famous for scoring the winning touchdown in overtime in the '\n\t[13663] link '1958 NFL Championship Game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_NFL_Championship_Game\n\t[13057] StaticText ' against the '\n\t[13664] link 'New York Giants' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1958_New_York_Giants_season\n\t[13059] StaticText ', labeled \"The Greatest Game Ever Played.\"'\n\t[13666] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-mscwiot-1\n\t[13669] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-everplyd-2\n\t[13673] table 'Alan Ameche'\n\t\t[13674] caption ''\n\t\t[13676] row ''\n\t\t\t[13677] cell ''\n\t\t\t\t[13585] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/Alan_Ameche_1960.jpg.webp\n\t\t[13679] row ''\n\t\t\t[13680] columnheader 'No. 35' required: False\n\t\t[13681] row ''\n\t\t\t[13682] rowheader 'Position:' required: False\n\t\t\t[13683] cell 'Fullback'\n\t\t\t\t[13684] link 'Fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13685] row ''\n\t\t\t[13686] columnheader 'Personal information' required: False\n\t\t[13687] row ''\n\t\t\t[13688] rowheader 'Born:' required: False\n\t\t\t[13689] cell 'June 1, 1933 Kenosha, Wisconsin, U.S.'\n\t\t\t\t[13692] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13693] row ''\n\t\t\t[13694] rowheader 'Died:' required: False\n\t\t\t[13695] cell 'August 8, 1988 (aged\\xa055) Houston, Texas, U.S.'\n\t[13189] StaticText 'With colleague and former Colts teammate '\n\t[13812] link 'Gino Marchetti' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino_Marchetti\n\t[13191] StaticText ', Ameche founded the '\n\t[13587] link \"Gino's Hamburgers\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Gino's_Hamburgers\n\t[13193] 
StaticText ' chain.'\n\t[13814] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-aaatyp-3\n\t[13816] group ''\n\t\t[13821] DisclosureTriangle 'Early life' expanded: True\n\t\t\t[13822] heading 'Early life'\n\t\t[13197] StaticText 'Born in '\n\t\t[13824] link 'Kenosha, Wisconsin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Kenosha%2C_Wisconsin\n\t\t[13199] StaticText ', as Lino Dante Amici to Italian immigrant parents who came to the United States in the late 1920s, although they returned for a year to '\n\t\t[13826] link 'Italy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Italy\n\t\t[13203] StaticText ' during his childhood. The family then returned to Kenosha, where he attended '\n\t\t[13827] link 'Bradford High School' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Mary_D._Bradford_High_School\n\t\t[13205] StaticText '. Ameche was a cousin of actor brothers '\n\t\t[13828] link 'Don' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Don_Ameche\n\t\t[13207] StaticText ' and '\n\t\t[13829] link 'Jim Ameche' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Jim_Ameche\n\t\t[13831] link '[]' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Alan_Ameche#cite_note-4\n\t[13833] group ''\n\t\t[13838] DisclosureTriangle 'College career' expanded: True\n\t\t\t[13839] heading 'College career'\n\t\t[13212] StaticText 'Ameche earned consensus '\n\t\t[13841] link 'All-America' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1954_College_Football_All-America_Team\n\t\t[13214] StaticText ' honors at the '\n\t\t[13842] link 'University of Wisconsin–Madison' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/University_of_Wisconsin%E2%80%93Madison\n\t\t[13216] StaticText ', where he played '\n\t\t[13843] link 'linebacker' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Linebacker\n\t\t[13218] StaticText ' as well as '\n\t\t[13844] 
link 'fullback' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Fullback_(American_football)\n\t\t[13845] link 'single-platoon' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/One-platoon_system\n\t\t[13222] StaticText ' days. In four years as a '\n\t\t[13846] link 'Badger' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Wisconsin_Badgers_football\n\t\t[13847] link 'bowl game' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Bowl_game\n\t\t[13226] StaticText ', the '\n\t\t[13848] link '1953 Rose Bowl' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/1953_Rose_Bowl\n\t\t[13228] StaticText ', as a sophomore, rushing for 133 yards on 28 carries. Ameche won the '\n\t\t[13849] link 'Heisman Trophy' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Heisman_Trophy\nHISTORY_ACTION: type [328] [Wisconsin Badgers Heisman Trophy winners] [1]\nclick [1391] [Wisconsin Badgers football]\nclick [4874] [Alan Ameche]\n"}, {"role": "assistant", "content": "<think>The page provides detailed information about Alan Ameche, including that he won the Heisman Trophy while at the University of Wisconsin–Madison and was elected to the Pro Bowl in each of his first four NFL seasons. This matches the user's query exactly.</think>\n```stop [Alan Ameche is the Wisconsin Badgers player who won a Heisman Trophy and was elected to the Pro Bowl in each of his first four seasons.]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who is the main character in that 70s show\nObservation: [1388] RootWebArea \"That '70s Show\" focused: True\n\t[1913] textbox \"Search 'Wikipedia'\" required: False\n\t[1915] link 'Go to welcome page'\n\t\t[1921] button '🏠'\n\t[1916] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1923] button 'Wikipedia'\n\t[1917] link 'Go to a randomly selected page'\n\t\t[1925] button '🎲'\n\t[1927] heading \"That '70s Show\" hasPopup: menu\n\t[1944] link 'period'\n\t[1946] link 'sitcom'\n\t[1947] StaticText ' that aired on '\n\t[1948] link 'Fox'\n\t[1949] StaticText ' from August 23, 1998, to May 18, 2006. 
The series focuses on the lives of a group of six teenage friends living in fictional Point Place, Wisconsin, from May 17, 1976, to December 31, 1979.'\n\t[1408] link '[]'\n\t[1599] table ''\n\t\t[2729] row ''\n\t\t\t[2748] columnheader \"That '70s Show\" required: False\n\t\t[2730] row ''\n\t\t\t[2749] gridcell '' required: False\n\t\t[1601] row ''\n\t\t\t[2750] rowheader 'Genre' required: False\n\t\t\t[1602] gridcell 'Period sitcom Teen sitcom' required: False\n\t\t\t\t[1606] ListMarker '■ '\n\t\t\t\t[3222] link 'Period'\n\t\t\t\t[3224] link 'sitcom'\n\t\t\t\t[1608] ListMarker '■ '\n\t\t\t\t[3227] link 'Teen sitcom'\n\t\t[1609] row ''\n\t\t\t[2751] rowheader 'Created by' required: False\n\t\t\t[1610] gridcell 'Bonnie Turner Terry Turner Mark Brazill' required: False\n\t\t\t\t[1614] ListMarker '■ '\n\t\t\t\t[3230] link 'Bonnie Turner'\n\t\t\t\t[1616] ListMarker '■ '\n\t\t\t\t[3232] link 'Terry Turner'\n\t\t\t\t[1618] ListMarker '■ '\n\t\t\t\t[3234] link 'Mark Brazill'\n\t\t[1619] row ''\n\t\t\t[2752] rowheader 'Directed by' required: False\n\t\t\t[1620] gridcell 'David Trainer Terry Hughes (pilot)' required: False\n\t\t\t\t[1624] ListMarker '■ '\n\t\t\t\t[3237] link 'David Trainer'\n\t\t\t\t[1626] ListMarker '■ '\n\t\t\t\t[3239] link 'Terry Hughes'\n\t\t\t\t[3240] StaticText ' (pilot)'\n\t\t[1627] row ''\n\t\t\t[2753] rowheader 'Starring' required: False\n\t\t\t[1628] gridcell 'Topher Grace Mila Kunis Ashton Kutcher Danny Masterson Laura Prepon Wilmer Valderrama Debra Jo Rupp Kurtwood Smith Tanya Roberts Don Stark Lisa Robin Kelly Tommy Chong Josh Meyers' required: False\n\t\t\t\t[1632] ListMarker '■ '\n\t\t\t\t[3243] link 'Topher Grace'\n\t\t\t\t[1634] ListMarker '■ '\n\t\t\t\t[3245] link 'Mila Kunis'\n\t\t\t\t[1636] ListMarker '■ '\n\t\t\t\t[3247] link 'Ashton Kutcher'\n\t[1950] StaticText 'The main teenage cast members were '\n\t[1951] link 'Topher Grace'\n\t[1953] link 'Mila Kunis'\n\t[1955] link 'Ashton Kutcher'\n\t[1957] link 'Danny Masterson'\n\t[1959] link 
'Laura Prepon'\n\t[1961] link 'Wilmer Valderrama'\n\t[1962] StaticText ', and '\n\t[1963] link 'Lisa Robin Kelly'\n\t[1964] StaticText '. The main adult cast members were '\n\t[1965] link 'Debra Jo Rupp'\n\t[1967] link 'Kurtwood Smith'\n\t[1969] link 'Don Stark'\n\t[1971] link 'Tommy Chong'\n\t[1972] StaticText ', and '\n\t[1973] link 'Tanya Roberts'\n\t[1975] StaticText 'In 1999, the show was remade by the '\n\t[1976] link 'ITV'\n\t[1977] StaticText ' network in the United Kingdom as '\n\t[1979] link 'Days Like These'\n\t[1980] StaticText ' using almost '\n\t[1981] link 'verbatim'\n\t[1982] StaticText ' scripts with minor changes to cultural references.'\n\t[1412] link '[]'\n\t[1414] Details ''\n\t\t[1694] DisclosureTriangle 'Cast' expanded: True\n\t\t\t[1983] heading 'Cast'\n\t\t[1985] note ''\n\t\t\t[1986] StaticText 'See also: '\n\t\t\t[1987] link \"List of That '70s, '80s, and '90s Show characters\"\n\t\t[1416] Details ''\n\t\t\t[1697] DisclosureTriangle 'Young cast' expanded: True\n\t\t\t\t[2805] heading 'Young cast'\n\t\t\t[1700] ListMarker '■ '\n\t\t\t[3376] link 'Topher Grace'\n\t\t\t[3378] link 'Eric Forman'\n\t\t\t[3379] StaticText ' (seasons 1–7; special guest season 8): Eric is a nice guy, geeky, physically slight and somewhat clumsy. He is a smart-ass with a fast wit and a deadpan sense of humor. His obsession with movies, particularly '\n\t\t\t[3381] link 'Star Wars'\n\t\t\t[3382] StaticText ', is often referenced throughout the show. For seven seasons Eric is in a relationship with his longtime love and neighbor '\n\t\t\t[3383] link 'Donna Pinciotti'\n\t\t\t[3384] StaticText '. His father, Red, is always hard on him. He convinces his parents to let his best friend '\n\t\t\t[3385] link 'Steven Hyde'\n\t\t\t[3386] StaticText ' move in with them, making Hyde like a brother. 
He decides to become a teacher after high school and he leaves the series at the end of the seventh season to teach in '\nHISTORY_ACTION: type [21] [That '70s Show] [1]\nclick [574] [link \"That '70s Show\"]\n"}, {"role": "assistant", "content": "<think>The section under \"Young cast\" describes Eric Forman, played by Topher Grace, in detail and refers to his central role, indicating he is the main character. I can now answer the question.</think>\n```stop [The main character in That '70s Show is Eric Forman, played by Topher Grace.]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who voices the main character in the lego movie\nObservation: [3271] RootWebArea 'The Lego Movie (franchise)' focused: True url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_(franchise)\n\t[3310] textbox \"Search 'Wikipedia'\" required: False\n\t[3316] link 'Go to welcome page' url: http://localhost:22015/\n\t\t[3317] button '🏠'\n\t[3318] link \"Go to the main page of 'Wikipedia'\" url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/\n\t\t[3319] button 'Wikipedia'\n\t[3320] link 'Go to a randomly selected page' url: http://localhost:22015/random?content=wikipedia_en_all_maxi_2022-05\n\t\t[3321] button '🎲'\n\t[3328] 
heading 'The Lego Movie (franchise)' hasPopup: menu\n\t[2722] StaticText 'The Lego Movie is an American '\n\t[3335] link 'media franchise' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Media_franchise\n\t[2725] StaticText ' based on '\n\t[3336] link 'Lego' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Lego\n\t[3337] link 'construction toys' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Construction_toy\n\t[2729] StaticText '. It began with the 2014 film '\n\t[3339] link 'The Lego Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie\n\t[2731] StaticText ', which was directed and written by '\n\t[3340] link 'Phil Lord and Christopher Miller' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Phil_Lord_and_Christopher_Miller\n\t[2733] StaticText '. The success of the film led to the release of two licensed video games, a '\n\t[3341] link '4D film theme park attraction' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie%3A_4D_%E2%80%93_A_New_Adventure\n\t[2735] StaticText ', two spin-off films titled '\n\t[3343] link 'The Lego Batman Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Batman_Movie\n\t[2737] StaticText ' and '\n\t[3345] link 'The Lego Ninjago Movie' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Ninjago_Movie\n\t[2739] StaticText ', which were released in 2017, '\n\t[3347] link 'Unikitty!' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Unikitty!\n\t[2741] StaticText ' an animated television series that also came out in the same year, and the sequel to the original film titled '\n\t[3349] link 'The Lego Movie 2: The Second Part' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Lego_Movie_2%3A_The_Second_Part\n\t[2743] StaticText ' in 2019. Plans for a third spin-off film and a sequel to The Lego Batman Movie were later shelved. 
The franchise has received a generally positive critical reception, with the exception of The Lego Ninjago Movie which received mixed reviews, and has grossed $1 billion at the worldwide box office. While '\n\t[3352] link 'Warner Bros.' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Warner_Bros.\n\t[2749] StaticText ' still owned the rights to their run of the franchise from 2014 to 2020, including the Unikitty! animated television series, '\n\t[3354] link 'Universal Pictures' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Universal_Pictures\n\t[3358] table ''\n\t\t[3360] row ''\n\t\t\t[3361] columnheader 'The Lego Movie' required: False\n\t\t[3362] row ''\n\t\t\t[3363] cell 'Logo to the first film.'\n\t\t\t\t[3365] image '' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/I/The_Lego_Movie_logo.png.webp\n\t\t[3367] row ''\n\t\t\t[3368] rowheader 'Created by' required: False\n\t\t\t[3369] cell 'Dan Lin Dan Hageman Kevin Hageman Roy Lee Phil Lord Christopher Miller'\n\t\t\t\t[2756] ListMarker '■ '\n\t\t\t\t[3373] link 'Dan Lin' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/Dan_Lin\n\t\t\t\t[2757] ListMarker '■'\n\t\t\t\t[3375] link 'Dan Hageman Kevin Hageman' url: http://localhost:22015/wikipedia_en_all_maxi_2022-05/A/The_Hageman_Brothers\n\t\t\t\t\t[2758] ListMarker '■ '\n\t\t\t\t\t[2759] ListMarker '■ '\n\t[3462] group ''\n\t\t[3467] DisclosureTriangle 'Films' expanded: True\n\t\t\t[3468] heading 'Films'\nHISTORY_ACTION: type [329] [The Lego Movie] [1]\nclick [2005] [The Lego Movie (franchise)]\n"}, {"role": "assistant", "content": "<think>The user's objective is to find out who voices the main character in The Lego Movie. On the current page, there is a link to \"The Lego Movie\" ([3339]). Clicking this link should lead to a page specifically about the film, where cast information is likely to be found.</think>\n```click [3339] [The Lego Movie]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: girl fox name on fox and the hound\nObservation: [1816] RootWebArea 'The Fox and the Hound' focused: True\n\t[2460] textbox \"Search 'Wikipedia'\" required: False\n\t[2462] link 'Go to welcome page'\n\t\t[2468] button '🏠'\n\t[2463] link \"Go to the main page of 'Wikipedia'\"\n\t\t[2470] button 'Wikipedia'\n\t[2464] link 'Go to a randomly selected page'\n\t\t[2472] button '🎲'\n\t[1829] table ''\n\t\t[4619] row ''\n\t\t\t[4628] gridcell 'Original theatrical release poster' required: False\n\t\t[2114] row ''\n\t\t\t[4629] rowheader 'Directed by' required: False\n\t\t\t[2115] gridcell 'Ted Berman Richard Rich Art Stevens' required: 
False\n\t\t\t\t[2119] ListMarker '■ '\n\t\t\t\t[5171] link 'Ted Berman'\n\t\t\t\t[2121] ListMarker '■ '\n\t\t\t\t[5173] link 'Richard Rich'\n\t\t\t\t[2123] ListMarker '■ '\n\t\t\t\t[5175] link 'Art Stevens'\n\t\t[2124] row ''\n\t\t\t[4630] rowheader 'Story by' required: False\n\t\t\t[2125] gridcell 'Larry Clemmons Ted Berman David Michener Peter Young Burny Mattinson Steve Hulett Earl Kress Vance Gerry' required: False\n\t\t\t\t[2129] ListMarker '■ '\n\t\t\t\t[5178] link 'Larry Clemmons'\n\t\t\t\t[2131] ListMarker '■ '\n\t\t\t\t[5180] StaticText 'Ted Berman'\n\t\t\t\t[2133] ListMarker '■ '\n\t\t\t\t[5181] StaticText 'David Michener'\n\t\t\t\t[2135] ListMarker '■ '\n\t\t\t\t[5182] StaticText 'Peter Young'\n\t\t\t\t[2137] ListMarker '■ '\n\t\t\t\t[5183] link 'Burny Mattinson'\n\t\t\t\t[2139] ListMarker '■ '\n\t\t\t\t[5185] StaticText 'Steve Hulett'\n\t\t\t\t[2141] ListMarker '■ '\n\t\t\t\t[5186] StaticText 'Earl Kress'\n\t\t\t\t[2143] ListMarker '■ '\n\t\t\t\t[5187] link 'Vance Gerry'\n\t\t[4620] row ''\n\t\t\t[4631] rowheader 'Based on' required: False\n\t\t\t[4632] gridcell 'The Fox and the Hound by Daniel P. Mannix' required: False\n\t\t\t\t[5191] link 'The Fox and the Hound'\n\t\t\t\t[5194] link 'Daniel P. 
Mannix'\n\t\t[2144] row ''\n\t\t\t[4633] rowheader 'Produced by' required: False\n\t\t\t[2145] gridcell 'Ron Miller Wolfgang Reitherman Art Stevens' required: False\n\t\t\t\t[2149] ListMarker '■ '\n\t\t\t\t[5198] link 'Ron Miller'\n\t\t\t\t[2151] ListMarker '■ '\n\t\t\t\t[5200] link 'Wolfgang Reitherman'\n\t\t\t\t[2153] ListMarker '■ '\n\t\t\t\t[5202] StaticText 'Art Stevens'\n\t\t[2154] row ''\n\t\t\t[4634] rowheader 'Starring' required: False\n\t\t\t[2155] gridcell 'Mickey Rooney Kurt Russell Pearl Bailey Jack Albertson Sandy Duncan Jeanette Nolan Pat Buttram John Fiedler John McIntire Dick Bakalyan Paul Winchell Keith Mitchell Corey Feldman' required: False\n\t\t\t\t[2159] ListMarker '■ '\n\t\t\t\t[5204] link 'Mickey Rooney'\n\t\t\t\t[2161] ListMarker '■ '\n\t\t\t\t[5206] link 'Kurt Russell'\n\t\t\t\t[2163] ListMarker '■ '\n\t\t\t\t[5208] link 'Pearl Bailey'\n\t\t\t\t[2165] ListMarker '■ '\n\t\t\t\t[5210] link 'Jack Albertson'\n\t[2563] StaticText 'Walt Disney Productions first obtained the film rights to the novel by Daniel P. Mannix in 1967; however, actual development on the film would not occur until spring 1977. The Fox and the Hound marked the last involvement of the remaining members of '\n\t[2567] link \"Disney's Nine Old Men\"\n\t[2568] StaticText ', which included '\n\t[2569] link 'Frank Thomas'\n\t[2570] StaticText ' and '\n\t[2571] link 'Ollie Johnston'\n\t[2572] StaticText '. Though the old animators had involvement in early development of the film, the film was ultimately handed over to a new generation of animators following the retirement of the old animators. As such, it was the first film for future directors including '\n\t[2573] link 'Tim Burton'\n\t[2575] link 'Brad Bird'\n\t[2576] StaticText ', and '\n\t[2577] link 'John Lasseter'\n\t[2579] link 'Don Bluth'\n\t[2580] StaticText ' and his team of animators. 
Further concerns were raised over the handling of the scene in which Chief is hit by a train, which was changed from his death into a non-fatal injury by which he merely suffers a broken leg.'\n\t[2582] StaticText 'The Fox and the Hound was released to theaters on July 10, 1981 by '\n\t[2584] link 'Buena Vista Distribution'\n\t[2585] StaticText '. The film was a financial success, becoming the 14th highest-grossing film of the year and earning $39.9 million in the United States. However, it initially received mixed reviews from critics, who praised the animation and voice acting of the film, but believed that the film was not groundbreaking enough. Over time, however, the film became known as an underrated classic, and it was nominated for three awards, of which it won one. At the time of its release, it was the most expensive animated film produced to date, costing $12 million.'\n\t[1843] link '[]'\n\t[2586] StaticText ' It was re-released to theaters on March 25, 1988.'\n\t[1846] link '[]'\n\t[2587] StaticText ' An intermediate follow-up, '\n\t[2589] link 'The Fox and the Hound 2'\n\t[2590] StaticText ', was released '\n\t[2591] link 'directly-to-DVD'\n\t[2592] StaticText ' on December 12, 2006.'\n\t[2198] Details ''\n\t\t[2200] DisclosureTriangle 'Plot' expanded: True\n\t\t\t[2593] heading 'Plot'\n\t\t[2598] StaticText 'After a young '\n\t\t[2599] link 'red fox'\n\t\t[2600] StaticText ' is orphaned, Big Mama the '\n\t\t[2601] link 'owl'\n\t\t[2602] StaticText ' and her friends, Dinky the '\n\t\t[2603] link 'finch'\n\t\t[2604] StaticText ' and Boomer the '\n\t\t[2605] link 'woodpecker'\n\t\t[2606] StaticText ', arrange for him to be adopted by kindly farmer Widow Tweed, who names him Tod. 
Meanwhile, her neighbor, hunter Amos Slade, brings home a young '\nHISTORY_ACTION: type [21] [fox and the hound] [1]\nclick [570] [link 'The Fox and the Hound']\nscroll [down]\n"}, {"role": "assistant", "content": "<think>The plot section has just started, but the answer may be further down. I should continue scrolling to find the name of the girl fox character.</think>\n```scroll [down]```"}]}
+{"messages": [{"role": "system", "content": "You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.\n\nHere's the information you'll have:\nThe user's objective: This is the task you're trying to complete.\nThe current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.\nThe open tabs: These are the tabs you have open.\nThe previous actions: There are the actions you just performed. It may be helpful to track your progress.\n\nThe actions you can perform fall into several categories:\n\nPage Operation Actions:\n`click [id] [content]`: This action clicks on an element with a specific id on the webpage.\n`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the \"\"Enter\"\" key is pressed after typing unless press_enter_after is set to 0.\n`hover [id] [content]`: Hover over an element with id.\n`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).\n`scroll [down|up]`: Scroll the page up or down.\n\nTab Management Actions:\n`new_tab`: Open a new, empty browser tab.\n`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.\n`close_tab`: Close the currently active tab.\n\nURL Navigation Actions:\n`goto [url]`: Navigate to a specific URL.\n`go_back`: Navigate to the previously viewed page.\n`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).\n\nCompletion Action:\n`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. 
If you believe the task is impossible to complete, provide the answer as \"\"N/A\"\" in the bracket.\n\nTo be successful, it is very important to follow the following rules:\n1. You should only issue an action that is valid given the current observation.\n2. You should only issue one action at a time.\n3. You should follow the examples to reason step by step and then issue the next action.\n4. You should refer to historical actions when issue an action and try not to make repetitive actions\n5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.\n6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:\n   <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>\n   ```click [id] [content]```\n7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.\n8. Always format actions correctly: \n```command [parameters]```\nFor example, if searching for \"\"death row inmates in the US\"\" in a search field with ID `21`, correctly format it as:\n```type [21] [death row inmates in the US] [1]```\nAvoid incorrect formats that omit brackets around parameters or numeric values.\n"}, {"role": "user", "content": "\nObjective: who sang with you i'm born again\nObservation: [1219] RootWebArea \"With You I'm Born Again\" focused: True\n\t[1532] textbox \"Search 'Wikipedia'\" required: False\n\t[1534] link 'Go to welcome page'\n\t\t[1540] button '🏠'\n\t[1535] link \"Go to the main page of 'Wikipedia'\"\n\t\t[1542] button 'Wikipedia'\n\t[1536] link 'Go to a randomly selected page'\n\t\t[1544] button '🎲'\n\t[1546] heading \"With You I'm Born Again\" hasPopup: menu\n\t[1556] StaticText '\"With You I\\'m Born Again\" is a 1979 '\n\t[1560] link 'duet'\n\t[1561] StaticText ' written by '\n\t[1562] link 'Carol Connors'\n\t[1563] StaticText ' and '\n\t[1564] link 'David 
Shire'\n\t[1565] StaticText ' that originated on the soundtrack of the 1979 motion picture '\n\t[1567] link 'Fast Break'\n\t[1568] StaticText '. It was performed by '\n\t[1569] link 'Motown'\n\t[1570] StaticText ' recording artists '\n\t[1571] link 'Billy Preston'\n\t[1572] StaticText ' and '\n\t[1573] link 'Syreeta Wright'\n\t[1574] StaticText ' (credited as Syreeta) and became an international hit for the duo, reaching number four on the '\n\t[1575] link 'Billboard Hot 100'\n\t[1230] link '[]'\n\t[1576] StaticText ' and number two on the UK singles chart.'\n\t[1233] link '[]'\n\t[1235] table ''\n\t\t[1991] row ''\n\t\t\t[1997] columnheader '\"With You I\\'m Born Again\"' required: False\n\t\t[1992] row ''\n\t\t\t[1998] gridcell '' required: False\n\t\t[1993] row ''\n\t\t\t[1999] columnheader 'Single by Billy Preston and Syreeta Wright' required: False\n\t\t\t\t[2095] link 'Single'\n\t\t\t\t[2097] link 'Billy Preston'\n\t\t\t\t[2099] link 'Syreeta Wright'\n\t\t[1994] row ''\n\t\t\t[2000] columnheader 'from the album Fast Break' required: False\n\t\t[1237] row ''\n\t\t\t[2001] rowheader 'B-side' required: False\n\t\t\t\t[2107] link 'B-side'\n\t\t\t[1238] gridcell '\"All I Wanted Was You\"[]' required: False\n\t\t\t\t[1240] link '[]'\n\t\t[1242] row ''\n\t\t\t[2002] rowheader 'Released' required: False\n\t\t\t[1243] gridcell 'December 1979[]' required: False\n\t\t\t\t[1245] link '[]'\n\t\t[1995] row ''\n\t\t\t[2003] rowheader 'Genre' required: False\n\t\t\t\t[2121] link 'Genre'\n\t\t\t[2004] gridcell 'Soul' required: False\n\t\t\t\t[2123] link 'Soul'\n\t\t[1247] row ''\n\t\t\t[2005] rowheader 'Label' required: False\n\t\t\t\t[2125] link 'Label'\n\t\t\t[1248] gridcell 'Motown[]' required: False\n\t\t\t\t[2127] link 'Motown'\n\t\t\t\t[1250] link '[]'\n\t\t[1996] row ''\n\t\t\t[2006] rowheader 'Songwriter(s)' required: False\n\t\t\t\t[2134] link 'Songwriter(s)'\n\t\t\t[2007] gridcell 'Carol Connors · David Shire' required: False\n\t\t\t\t[2140] link 'Carol 
Connors'\n\t\t\t\t[2144] link 'David Shire'\n\t\t[1252] row ''\n\t\t\t[2008] rowheader 'Producer(s)' required: False\n\t\t\t\t[2147] link 'Producer(s)'\n\t\t\t[1253] gridcell 'James DiPasquale[]' required: False\n\t\t\t\t[2149] link 'James DiPasquale'\n\t\t\t\t[1255] link '[]'\n\t[1257] Details ''\n\t\t[1429] DisclosureTriangle 'History' expanded: True\n\t\t\t[1577] heading 'History'\n\t\t[1579] StaticText 'The music from '\n\t\t[1581] link 'Fast Break'\n\t\t[1582] StaticText ' was credited to '\n\t\t[1583] link 'David Shire'\n\t\t[1584] StaticText ' and '\n\t\t[1585] link 'James DiPasquale'\n\t\t[1586] StaticText ', and '\n\t\t[1587] link 'Carol Connors'\n\t\t[1588] StaticText ' put words to four of the songs on the soundtrack album. Connors recalls, \"I had written most of the lyrics to \\'With You I\\'m Born Again\\' in 22 minutes, but I couldn\\'t find two words and it was driving me crazy.\"'\n\t\t[1261] link '[]'\n\t\t[1589] StaticText ' The right words came to her on her way to an ice cream parlor, where she requested a pencil and paper along with her order. \"I had finally found it—the line I was seeking--\\'Come show me your kindness, in your arms I know I\\'ll find this\\'—and I wanted to get it down on paper.\"'\n\t\t[1264] link '[]'\n\t\t[1590] link 'Motown'\n\t\t[1591] StaticText ' executive '\n\t\t[1592] link 'Suzanne de Passe'\n\t\t[1268] link '[]'\n\t\t[1594] StaticText ' and each singer also had a solo number to cut for the soundtrack album as well. Connors recalled, \"Billy and Syreeta were originally not very fond of the idea of recording together—they each wanted to do their own thing, but this worked out extremely well for both of them.\"'\n\t\t[1271] link '[]'\n\t\t[1595] StaticText ' The reluctant pair were surprised by the success of \"With You I\\'m Born Again\". 
\"Though both liked the song, they never dreamed of it being a hit single.\"'\n\t\t[1274] link '[]'\n\t\t[1596] StaticText 'Connors noted the impression some were left with by the song and Preston. \"When David Shire heard Billy Preston play the song on the organ, he said to me, \\'Now I know why he\\'s a star.\\'\" Connors also shared, \"The late great '\nHISTORY_ACTION: type [21] [With You I'm Born Again] [1]\nclick [570] [link \"With You I'm Born Again\"]\n"}, {"role": "assistant", "content": "<think>The answer can be found in the static text: '\"With You I'm Born Again\"... It was performed by Motown recording artists Billy Preston and Syreeta Wright (credited as Syreeta) and became an international hit for the duo.' The singers are Billy Preston and Syreeta Wright.</think>\n```stop [Billy Preston and Syreeta Wright]```"}]}

qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/args.json ADDED Viewed

	@@ -0,0 +1,380 @@

+{
+  "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "epoch",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 4,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 2,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 1e-06,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 1.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 1,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "epoch",
+  "save_steps": 500,
+  "save_total_limit": null,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 20.0,
+  "dataloader_num_workers": 48,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": false,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "zero_quantized_weights": false,
+      "zero_quantized_gradients": false,
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch_fused",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "hub_revision": null,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "liger_kernel_config": null,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": true,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "channels": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "model": "Qwen/Qwen2.5-7B",
+  "model_type": "qwen2_5",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen2_5",
+  "system": null,
+  "max_length": 16240,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_free": false,
+  "padding_side": "right",
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "corr_nq_1500q_hotpot_1500q_new600q_swift.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.001,
+  "dataset_num_proc": 100,
+  "load_from_cache_file": true,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "lazy_tokenize": false,
+  "cached_dataset": [],
+  "custom_register_path": [],
+  "use_hf": false,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": false,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_lark_webhook_url": null,
+  "swanlab_lark_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "Qwen2.5-7B",
+  "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f57c7e2ce50>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/usr/local/app/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "epoch",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=20.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v0-20250821-164344', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': 
False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+}

qwen2.5-7b-base-1500q-1500q-600q-1e_6-old-click-1ep/checkpoint-193/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}