diff --git "a/scripts/nohup.out" "b/scripts/nohup.out"
new file mode 100644--- /dev/null
+++ "b/scripts/nohup.out"
@@ -0,0 +1,20571 @@
+++++ readlink -f sft_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/sft_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl
++ per_device_train_batch_size=12
++ gradient_accumulation_steps=2
++ max_lengths=1024
++ num_train_epochs=1
++ task=sft_0915
++ tag=base
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base
++ cp sft_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/train.log
++ swift sft --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 2 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000
+[2025-09-15 15:36:15,188] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/sft.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 2 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 15:36:22,192] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 15:36:22,226] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 15:36:22,374] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 15:36:22,449] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 15:36:22,469] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 15:36:22,573] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 15:36:22,579] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 15:36:22,612] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-15 15:36:23,813] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:23,813] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-09-15 15:36:24,038] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:24,126] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:24,517] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=True,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl'],
+dataset_num_proc=16,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=0.1,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=2,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=1024,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=-1,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=1.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base,
+overwrite_output_dir=False,
+packing=False,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=12,
+per_device_train_batch_size=12,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base,
+save_on_each_node=False,
+save_only_model=True,
+save_safetensors=True,
+save_steps=0.1,
+save_strategy=steps,
+save_total_limit=3,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=False,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.01,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] attn_impl: flash_attn
+[2025-09-15 15:36:24,653] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:24,668] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:24,689] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 15:36:24,692] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.18s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.05it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.28s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.18s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.10it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.09it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.06it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.08s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.20s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.04s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.21s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.12s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.25s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.09s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.06s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.24it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.18it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.23it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.39it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.39it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.36it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] Start time of running main: 2025-09-15 15:36:27.432108
+[INFO:swift] swift.__version__: 3.7.3
+Setting num_proc from 16 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 68546 examples [00:00, 463780.71 examples/s]Generating train split: 100792 examples [00:00, 489243.40 examples/s]
+Map (num_proc=16):   0%|          | 0/100792 [00:00<?, ? examples/s]Map (num_proc=16):   5%|▍         | 5000/100792 [00:00<00:01, 49599.99 examples/s]Map (num_proc=16): 100%|██████████| 100792/100792 [00:00<00:00, 373250.01 examples/s]
+Setting num_proc from 16 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 500 examples [00:00, 212412.84 examples/s]
+Map (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]Map (num_proc=16): 100%|██████████| 500/500 [00:00<00:00, 3437.44 examples/s]
+[INFO:swift] train_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 100792
+})
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 500
+})
+Map (num_proc=16):   0%|          | 0/100792 [00:00<?, ? examples/s]Map (num_proc=16):   1%|          | 1000/100792 [00:00<01:38, 1012.16 examples/s]Map (num_proc=16):   3%|▎         | 3000/100792 [00:01<00:31, 3143.40 examples/s]Map (num_proc=16):   5%|▍         | 5000/100792 [00:01<00:19, 5001.51 examples/s]Map (num_proc=16):   7%|▋         | 7000/100792 [00:01<00:13, 7021.46 examples/s]Map (num_proc=16):  10%|▉         | 10000/100792 [00:01<00:08, 10423.67 examples/s]Map (num_proc=16):  13%|█▎        | 13000/100792 [00:01<00:06, 13673.64 examples/s]Map (num_proc=16):  16%|█▌        | 16000/100792 [00:01<00:05, 16363.04 examples/s]Map (num_proc=16):  19%|█▉        | 19000/100792 [00:01<00:04, 19369.47 examples/s]Map (num_proc=16):  23%|██▎       | 23000/100792 [00:02<00:03, 22533.25 examples/s]Map (num_proc=16):  27%|██▋       | 27000/100792 [00:02<00:02, 26079.20 examples/s]Map (num_proc=16):  30%|██▉       | 30000/100792 [00:02<00:02, 26786.17 examples/s]Map (num_proc=16):  34%|███▎      | 34000/100792 [00:02<00:02, 27740.96 examples/s]Map (num_proc=16):  39%|███▊      | 39000/100792 [00:02<00:01, 32180.95 examples/s]Map (num_proc=16):  43%|████▎     | 43000/100792 [00:02<00:01, 33490.30 examples/s]Map (num_proc=16):  47%|████▋     | 47000/100792 [00:02<00:01, 33190.20 examples/s]Map (num_proc=16):  51%|█████     | 51000/100792 [00:02<00:01, 32560.64 examples/s]Map (num_proc=16):  55%|█████▍    | 55000/100792 [00:02<00:01, 34378.00 examples/s]Map (num_proc=16):  59%|█████▊    | 59000/100792 [00:03<00:01, 34590.17 examples/s]Map (num_proc=16):  63%|██████▎   | 63000/100792 [00:03<00:01, 33280.19 examples/s]Map (num_proc=16):  66%|██████▋   | 67000/100792 [00:03<00:01, 27748.91 examples/s]Map (num_proc=16):  70%|███████   | 71000/100792 [00:03<00:01, 27552.89 examples/s]Map (num_proc=16):  73%|███████▎  | 74000/100792 [00:03<00:01, 26044.55 examples/s]Map (num_proc=16):  76%|███████▋  | 77000/100792 [00:03<00:01, 21680.31 examples/s]Map (num_proc=16):  83%|████████▎ | 84000/100792 [00:04<00:00, 29771.35 examples/s]Map (num_proc=16):  87%|████████▋ | 87500/100792 [00:04<00:00, 29816.84 examples/s]Map (num_proc=16):  90%|█████████ | 91100/100792 [00:04<00:00, 27846.23 examples/s]Map (num_proc=16):  94%|█████████▍| 94998/100792 [00:04<00:00, 22165.11 examples/s]Map (num_proc=16):  97%|█████████▋| 97596/100792 [00:04<00:00, 21214.91 examples/s]Map (num_proc=16): 100%|█████████▉| 100493/100792 [00:04<00:00, 18807.15 examples/s]Map (num_proc=16): 100%|██████████| 100792/100792 [00:05<00:00, 19458.08 examples/s]
+Map (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]Map (num_proc=16):   6%|▋         | 32/500 [00:00<00:07, 64.49 examples/s]Map (num_proc=16):  19%|█▉        | 96/500 [00:00<00:02, 167.60 examples/s]Map (num_proc=16):  32%|███▏      | 159/500 [00:00<00:01, 232.78 examples/s]Map (num_proc=16):  44%|████▍     | 221/500 [00:00<00:01, 277.56 examples/s]Map (num_proc=16):  57%|█████▋    | 283/500 [00:01<00:00, 305.65 examples/s]Map (num_proc=16):  69%|██████▉   | 345/500 [00:01<00:00, 308.21 examples/s]Map (num_proc=16):  81%|████████▏ | 407/500 [00:01<00:00, 368.47 examples/s]Map (num_proc=16):  94%|█████████▍| 469/500 [00:01<00:00, 342.14 examples/s]Map (num_proc=16): 100%|██████████| 500/500 [00:01<00:00, 276.08 examples/s]
+[INFO:swift] [INPUT_IDS] [151644, 872, 198, 27473, 279, 2701, 1467, 504, 6364, 1119, 8453, 510, 22574, 25, 1597, 773, 1657, 10488, 633, 13628, 979, 429, 8573, 624, 44923, 25, 151645, 198, 151644, 77091, 198, 39165, 106334, 99726, 13343, 3837, 49434, 239, 79478, 103939, 28726, 20726, 99555, 101135, 1773, 151645]
+[INFO:swift] [INPUT] <|im_start|>user
+Translate the following text from English into Chinese:
+English: And so many opportunities get missed when that happens.
+Chinese:<|im_end|>
+<|im_start|>assistant
+当这种情况发生时， 我们就会错失很多机会。<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 39165, 106334, 99726, 13343, 3837, 49434, 239, 79478, 103939, 28726, 20726, 99555, 101135, 1773, 151645]
+[INFO:swift] [LABELS] [-100 * 31]当这种情况发生时， 我们就会错失很多机会。<|im_end|>
+[INFO:swift] Dataset Token Length: 116.611973±73.344357, min=25.000000, max=781.000000, size=100792
+[INFO:swift] Dataset Token Length: 136.436000±75.772303, min=29.000000, max=509.000000, size=500
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 4022.4681M Params (4022.4681M Trainable [100.0000%]), 0.0001M Buffers.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/logging.jsonl
+Train:   0%|          | 0/525 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+[INFO:swift] use_logits_to_keep: True
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 1/525 [00:02<18:23,  2.11s/it]                                                      {'loss': 1.85508394, 'token_acc': 0.65154867, 'grad_norm': 19.76698494, 'learning_rate': 3.33e-06, 'memory(GiB)': 28.31, 'train_speed(iter/s)': 0.058972, 'epoch': 0.0, 'global_step/max_steps': '1/525', 'percentage': '0.19%', 'elapsed_time': '2s', 'remaining_time': '18m 25s'}
+Train:   0%|          | 1/525 [00:02<18:23,  2.11s/it]Train:   0%|          | 1/525 [00:02<18:23,  2.11s/it]Train:   0%|          | 2/525 [00:03<12:53,  1.48s/it]Train:   1%|          | 3/525 [00:04<11:56,  1.37s/it]Train:   1%|          | 4/525 [00:05<10:45,  1.24s/it]Train:   1%|          | 5/525 [00:06<10:13,  1.18s/it]Train:   1%|          | 6/525 [00:07<10:07,  1.17s/it]Train:   1%|▏         | 7/525 [00:08<09:51,  1.14s/it]Train:   2%|▏         | 8/525 [00:09<09:40,  1.12s/it]Train:   2%|▏         | 9/525 [00:10<09:27,  1.10s/it]Train:   2%|▏         | 10/525 [00:11<09:03,  1.05s/it]                                                       {'loss': 1.61825509, 'token_acc': 0.6575236, 'grad_norm': 4.48858261, 'learning_rate': 2e-05, 'memory(GiB)': 45.17, 'train_speed(iter/s)': 0.374902, 'epoch': 0.02, 'global_step/max_steps': '10/525', 'percentage': '1.90%', 'elapsed_time': '11s', 'remaining_time': '10m 8s'}
+Train:   2%|▏         | 10/525 [00:11<09:03,  1.05s/it]Train:   2%|▏         | 10/525 [00:11<09:03,  1.05s/it]Train:   2%|▏         | 11/525 [00:13<09:29,  1.11s/it]Train:   2%|▏         | 12/525 [00:14<09:10,  1.07s/it]Train:   2%|▏         | 13/525 [00:14<08:45,  1.03s/it]Train:   3%|▎         | 14/525 [00:15<08:44,  1.03s/it]Train:   3%|▎         | 15/525 [00:17<09:23,  1.11s/it]Train:   3%|▎         | 16/525 [00:18<09:22,  1.11s/it]Train:   3%|▎         | 17/525 [00:19<09:05,  1.07s/it]Train:   3%|▎         | 18/525 [00:20<09:26,  1.12s/it]Train:   4%|▎         | 19/525 [00:21<09:12,  1.09s/it]Train:   4%|▍         | 20/525 [00:22<09:00,  1.07s/it]                                                       {'loss': 1.42762375, 'token_acc': 0.69602699, 'grad_norm': 4.14555454, 'learning_rate': 1.996e-05, 'memory(GiB)': 55.66, 'train_speed(iter/s)': 0.533288, 'epoch': 0.04, 'global_step/max_steps': '20/525', 'percentage': '3.81%', 'elapsed_time': '22s', 'remaining_time': '9m 32s'}
+Train:   4%|▍         | 20/525 [00:22<09:00,  1.07s/it]Train:   4%|▍         | 20/525 [00:22<09:00,  1.07s/it]Train:   4%|▍         | 21/525 [00:23<08:58,  1.07s/it]Train:   4%|▍         | 22/525 [00:25<09:31,  1.14s/it]Train:   4%|▍         | 23/525 [00:26<09:29,  1.14s/it]Train:   5%|▍         | 24/525 [00:27<09:24,  1.13s/it]Train:   5%|▍         | 25/525 [00:28<09:15,  1.11s/it]Train:   5%|▍         | 26/525 [00:29<09:27,  1.14s/it]Train:   5%|▌         | 27/525 [00:30<09:11,  1.11s/it]Train:   5%|▌         | 28/525 [00:31<09:08,  1.10s/it]Train:   6%|▌         | 29/525 [00:32<09:14,  1.12s/it]Train:   6%|▌         | 30/525 [00:33<09:12,  1.12s/it]                                                       {'loss': 1.37146387, 'token_acc': 0.68209848, 'grad_norm': 4.22783041, 'learning_rate': 1.989e-05, 'memory(GiB)': 55.66, 'train_speed(iter/s)': 0.61504, 'epoch': 0.06, 'global_step/max_steps': '30/525', 'percentage': '5.71%', 'elapsed_time': '33s', 'remaining_time': '9m 19s'}
+Train:   6%|▌         | 30/525 [00:33<09:12,  1.12s/it]Train:   6%|▌         | 30/525 [00:33<09:12,  1.12s/it]Train:   6%|▌         | 31/525 [00:35<09:22,  1.14s/it]Train:   6%|▌         | 32/525 [00:36<08:57,  1.09s/it]Train:   6%|▋         | 33/525 [00:37<08:56,  1.09s/it]Train:   6%|▋         | 34/525 [00:38<08:53,  1.09s/it]Train:   7%|▋         | 35/525 [00:39<09:00,  1.10s/it]Train:   7%|▋         | 36/525 [00:40<08:48,  1.08s/it]Train:   7%|▋         | 37/525 [00:41<08:38,  1.06s/it]Train:   7%|▋         | 38/525 [00:42<09:05,  1.12s/it]Train:   7%|▋         | 39/525 [00:43<08:57,  1.11s/it]Train:   8%|▊         | 40/525 [00:44<08:57,  1.11s/it]                                                       {'loss': 1.30498199, 'token_acc': 0.7010929, 'grad_norm': 3.68515229, 'learning_rate': 1.979e-05, 'memory(GiB)': 55.66, 'train_speed(iter/s)': 0.669539, 'epoch': 0.08, 'global_step/max_steps': '40/525', 'percentage': '7.62%', 'elapsed_time': '44s', 'remaining_time': '9m 4s'}
+Train:   8%|▊         | 40/525 [00:44<08:57,  1.11s/it]Train:   8%|▊         | 40/525 [00:44<08:57,  1.11s/it]Train:   8%|▊         | 41/525 [00:45<08:27,  1.05s/it]Train:   8%|▊         | 42/525 [00:46<08:30,  1.06s/it]Train:   8%|▊         | 43/525 [00:48<09:23,  1.17s/it]Train:   8%|▊         | 44/525 [00:49<09:27,  1.18s/it]Train:   9%|▊         | 45/525 [00:50<09:02,  1.13s/it]Train:   9%|▉         | 46/525 [00:51<09:05,  1.14s/it]Train:   9%|▉         | 47/525 [00:52<08:59,  1.13s/it]Train:   9%|▉         | 48/525 [00:53<09:00,  1.13s/it]Train:   9%|▉         | 49/525 [00:55<09:37,  1.21s/it]Train:  10%|▉         | 50/525 [00:56<09:08,  1.16s/it]                                                       {'loss': 1.23887329, 'token_acc': 0.69400684, 'grad_norm': 2.48695827, 'learning_rate': 1.965e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.702144, 'epoch': 0.1, 'global_step/max_steps': '50/525', 'percentage': '9.52%', 'elapsed_time': '56s', 'remaining_time': '8m 55s'}
+Train:  10%|▉         | 50/525 [00:56<09:08,  1.16s/it]Train:  10%|▉         | 50/525 [00:56<09:08,  1.16s/it]Train:  10%|▉         | 51/525 [00:57<09:29,  1.20s/it]Train:  10%|▉         | 52/525 [00:58<09:24,  1.19s/it]Train:  10%|█         | 53/525 [00:59<09:10,  1.17s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.50it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.09it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.59it/s]                                                                                                         {'eval_loss': 1.24785483, 'eval_token_acc': 0.70641564, 'eval_runtime': 1.2022, 'eval_samples_per_second': 415.899, 'eval_steps_per_second': 4.991, 'epoch': 0.1, 'global_step/max_steps': '53/525', 'percentage': '10.10%', 'elapsed_time': '1m 1s', 'remaining_time': '9m 4s'}
+Train:  10%|█         | 53/525 [01:01<09:10,  1.17s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.59it/s]Train:  10%|█         | 53/525 [01:01<09:10,  1.17s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  8.29it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-53
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  10%|█         | 54/525 [01:18<51:02,  6.50s/it]Train:  10%|█         | 55/525 [01:19<38:04,  4.86s/it]Train:  11%|█         | 56/525 [01:21<29:21,  3.76s/it]Train:  11%|█         | 57/525 [01:22<22:56,  2.94s/it]Train:  11%|█         | 58/525 [01:23<18:39,  2.40s/it]Train:  11%|█         | 59/525 [01:24<15:51,  2.04s/it]Train:  11%|█▏        | 60/525 [01:25<13:20,  1.72s/it]                                                       {'loss': 1.18452091, 'token_acc': 0.71189238, 'grad_norm': 2.41434407, 'learning_rate': 1.947e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.598143, 'epoch': 0.11, 'global_step/max_steps': '60/525', 'percentage': '11.43%', 'elapsed_time': '1m 25s', 'remaining_time': '11m 2s'}
+Train:  11%|█▏        | 60/525 [01:25<13:20,  1.72s/it]Train:  11%|█▏        | 60/525 [01:25<13:20,  1.72s/it]Train:  12%|█▏        | 61/525 [01:26<11:41,  1.51s/it]Train:  12%|█▏        | 62/525 [01:27<10:25,  1.35s/it]Train:  12%|█▏        | 63/525 [01:28<09:58,  1.30s/it]Train:  12%|█▏        | 64/525 [01:29<09:20,  1.22s/it]Train:  12%|█▏        | 65/525 [01:30<08:40,  1.13s/it]Train:  13%|█▎        | 66/525 [01:31<08:22,  1.09s/it]Train:  13%|█▎        | 67/525 [01:32<08:29,  1.11s/it]Train:  13%|█▎        | 68/525 [01:34<08:47,  1.15s/it]Train:  13%|█▎        | 69/525 [01:35<08:47,  1.16s/it]Train:  13%|█▎        | 70/525 [01:36<08:37,  1.14s/it]                                                       {'loss': 1.19660473, 'token_acc': 0.7101556, 'grad_norm': 2.32760048, 'learning_rate': 1.926e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.630002, 'epoch': 0.13, 'global_step/max_steps': '70/525', 'percentage': '13.33%', 'elapsed_time': '1m 36s', 'remaining_time': '10m 25s'}
+Train:  13%|█▎        | 70/525 [01:36<08:37,  1.14s/it]Train:  13%|█▎        | 70/525 [01:36<08:37,  1.14s/it]Train:  14%|█▎        | 71/525 [01:37<08:53,  1.18s/it]Train:  14%|█▎        | 72/525 [01:38<08:33,  1.13s/it]Train:  14%|█▍        | 73/525 [01:39<08:26,  1.12s/it]Train:  14%|█▍        | 74/525 [01:40<08:12,  1.09s/it]Train:  14%|█▍        | 75/525 [01:41<08:15,  1.10s/it]Train:  14%|█▍        | 76/525 [01:42<08:09,  1.09s/it]Train:  15%|█▍        | 77/525 [01:43<08:05,  1.08s/it]Train:  15%|█▍        | 78/525 [01:45<08:20,  1.12s/it]Train:  15%|█▌        | 79/525 [01:46<07:56,  1.07s/it]Train:  15%|█▌        | 80/525 [01:47<07:57,  1.07s/it]                                                       {'loss': 1.14489002, 'token_acc': 0.70331225, 'grad_norm': 1.95450723, 'learning_rate': 1.901e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.655628, 'epoch': 0.15, 'global_step/max_steps': '80/525', 'percentage': '15.24%', 'elapsed_time': '1m 47s', 'remaining_time': '9m 56s'}
+Train:  15%|█▌        | 80/525 [01:47<07:57,  1.07s/it]Train:  15%|█▌        | 80/525 [01:47<07:57,  1.07s/it]Train:  15%|█▌        | 81/525 [01:48<07:48,  1.06s/it]Train:  16%|█▌        | 82/525 [01:49<07:44,  1.05s/it]Train:  16%|█▌        | 83/525 [01:50<08:00,  1.09s/it]Train:  16%|█▌        | 84/525 [01:51<08:00,  1.09s/it]Train:  16%|█▌        | 85/525 [01:52<07:41,  1.05s/it]Train:  16%|█▋        | 86/525 [01:53<08:00,  1.09s/it]Train:  17%|█▋        | 87/525 [01:54<07:53,  1.08s/it]Train:  17%|█▋        | 88/525 [01:55<08:16,  1.14s/it]Train:  17%|█▋        | 89/525 [01:57<08:41,  1.20s/it]Train:  17%|█▋        | 90/525 [01:58<09:20,  1.29s/it]                                                       {'loss': 1.15060062, 'token_acc': 0.71445106, 'grad_norm': 1.89210403, 'learning_rate': 1.873e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.67341, 'epoch': 0.17, 'global_step/max_steps': '90/525', 'percentage': '17.14%', 'elapsed_time': '1m 58s', 'remaining_time': '9m 34s'}
+Train:  17%|█▋        | 90/525 [01:58<09:20,  1.29s/it]Train:  17%|█▋        | 90/525 [01:58<09:20,  1.29s/it]Train:  17%|█▋        | 91/525 [01:59<09:03,  1.25s/it]Train:  18%|█▊        | 92/525 [02:01<08:49,  1.22s/it]Train:  18%|█▊        | 93/525 [02:02<08:35,  1.19s/it]Train:  18%|█▊        | 94/525 [02:03<08:32,  1.19s/it]Train:  18%|█▊        | 95/525 [02:04<08:07,  1.13s/it]Train:  18%|█▊        | 96/525 [02:05<08:17,  1.16s/it]Train:  18%|█▊        | 97/525 [02:06<08:23,  1.18s/it]Train:  19%|█▊        | 98/525 [02:07<08:13,  1.16s/it]Train:  19%|█▉        | 99/525 [02:09<07:59,  1.13s/it]Train:  19%|█▉        | 100/525 [02:10<08:34,  1.21s/it]                                                        {'loss': 1.09808836, 'token_acc': 0.73052664, 'grad_norm': 1.94947791, 'learning_rate': 1.842e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.688296, 'epoch': 0.19, 'global_step/max_steps': '100/525', 'percentage': '19.05%', 'elapsed_time': '2m 10s', 'remaining_time': '9m 14s'}
+Train:  19%|█▉        | 100/525 [02:10<08:34,  1.21s/it]Train:  19%|█▉        | 100/525 [02:10<08:34,  1.21s/it]Train:  19%|█▉        | 101/525 [02:11<08:06,  1.15s/it]Train:  19%|█▉        | 102/525 [02:12<08:08,  1.15s/it]Train:  20%|█▉        | 103/525 [02:13<07:55,  1.13s/it]Train:  20%|█▉        | 104/525 [02:14<07:48,  1.11s/it]Train:  20%|██        | 105/525 [02:15<07:27,  1.07s/it]Train:  20%|██        | 106/525 [02:16<07:25,  1.06s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.26it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.04it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.49it/s]                                                                                                          {'eval_loss': 1.11945701, 'eval_token_acc': 0.73297876, 'eval_runtime': 1.3061, 'eval_samples_per_second': 382.812, 'eval_steps_per_second': 4.594, 'epoch': 0.2, 'global_step/max_steps': '106/525', 'percentage': '20.19%', 'elapsed_time': '2m 18s', 'remaining_time': '9m 5s'}
+Train:  20%|██        | 106/525 [02:18<07:25,  1.06s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.49it/s]Train:  20%|██        | 106/525 [02:18<07:25,  1.06s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  8.21it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-106
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  20%|██        | 107/525 [02:35<44:16,  6.36s/it]Train:  21%|██        | 108/525 [02:36<33:15,  4.79s/it]Train:  21%|██        | 109/525 [02:37<25:21,  3.66s/it]Train:  21%|██        | 110/525 [02:38<20:04,  2.90s/it]                                                        {'loss': 1.1299715, 'token_acc': 0.72152965, 'grad_norm': 2.24327874, 'learning_rate': 1.808e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.633618, 'epoch': 0.21, 'global_step/max_steps': '110/525', 'percentage': '20.95%', 'elapsed_time': '2m 38s', 'remaining_time': '9m 58s'}
+Train:  21%|██        | 110/525 [02:38<20:04,  2.90s/it]Train:  21%|██        | 110/525 [02:38<20:04,  2.90s/it]Train:  21%|██        | 111/525 [02:39<16:21,  2.37s/it]Train:  21%|██▏       | 112/525 [02:40<13:33,  1.97s/it]Train:  22%|██▏       | 113/525 [02:42<11:57,  1.74s/it]Train:  22%|██▏       | 114/525 [02:43<10:49,  1.58s/it]Train:  22%|██▏       | 115/525 [02:44<09:44,  1.43s/it]Train:  22%|██▏       | 116/525 [02:45<08:40,  1.27s/it]Train:  22%|██▏       | 117/525 [02:46<08:03,  1.19s/it]Train:  22%|██▏       | 118/525 [02:47<07:50,  1.16s/it]Train:  23%|██▎       | 119/525 [02:48<07:39,  1.13s/it]Train:  23%|██▎       | 120/525 [02:49<07:34,  1.12s/it]                                                        {'loss': 1.09982128, 'token_acc': 0.73218401, 'grad_norm': 2.19070649, 'learning_rate': 1.771e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.650738, 'epoch': 0.23, 'global_step/max_steps': '120/525', 'percentage': '22.86%', 'elapsed_time': '2m 49s', 'remaining_time': '9m 32s'}
+Train:  23%|██▎       | 120/525 [02:49<07:34,  1.12s/it]Train:  23%|██▎       | 120/525 [02:49<07:34,  1.12s/it]Train:  23%|██▎       | 121/525 [02:50<07:17,  1.08s/it]Train:  23%|██▎       | 122/525 [02:51<07:12,  1.07s/it]Train:  23%|██▎       | 123/525 [02:52<07:36,  1.14s/it]Train:  24%|██▎       | 124/525 [02:54<07:59,  1.20s/it]Train:  24%|██▍       | 125/525 [02:55<07:50,  1.18s/it]Train:  24%|██▍       | 126/525 [02:56<07:52,  1.18s/it]Train:  24%|██▍       | 127/525 [02:57<07:35,  1.15s/it]Train:  24%|██▍       | 128/525 [02:58<07:14,  1.09s/it]Train:  25%|██▍       | 129/525 [02:59<07:20,  1.11s/it]Train:  25%|██▍       | 130/525 [03:00<07:13,  1.10s/it]                                                        {'loss': 1.07520561, 'token_acc': 0.72483707, 'grad_norm': 2.02295065, 'learning_rate': 1.731e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.664451, 'epoch': 0.25, 'global_step/max_steps': '130/525', 'percentage': '24.76%', 'elapsed_time': '3m 0s', 'remaining_time': '9m 9s'}
+Train:  25%|██▍       | 130/525 [03:00<07:13,  1.10s/it]Train:  25%|██▍       | 130/525 [03:00<07:13,  1.10s/it]Train:  25%|██▍       | 131/525 [03:01<07:01,  1.07s/it]Train:  25%|██▌       | 132/525 [03:02<06:52,  1.05s/it]Train:  25%|██▌       | 133/525 [03:03<06:55,  1.06s/it]Train:  26%|██▌       | 134/525 [03:05<07:17,  1.12s/it]Train:  26%|██▌       | 135/525 [03:06<07:09,  1.10s/it]Train:  26%|██▌       | 136/525 [03:07<07:17,  1.12s/it]Train:  26%|██▌       | 137/525 [03:08<07:13,  1.12s/it]Train:  26%|██▋       | 138/525 [03:09<07:20,  1.14s/it]Train:  26%|██▋       | 139/525 [03:10<07:01,  1.09s/it]Train:  27%|██▋       | 140/525 [03:11<07:10,  1.12s/it]                                                        {'loss': 1.07658634, 'token_acc': 0.736679, 'grad_norm': 2.28215909, 'learning_rate': 1.689e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.677362, 'epoch': 0.27, 'global_step/max_steps': '140/525', 'percentage': '26.67%', 'elapsed_time': '3m 11s', 'remaining_time': '8m 47s'}
+Train:  27%|██▋       | 140/525 [03:11<07:10,  1.12s/it]Train:  27%|██▋       | 140/525 [03:11<07:10,  1.12s/it]Train:  27%|██▋       | 141/525 [03:13<07:23,  1.15s/it]Train:  27%|██▋       | 142/525 [03:14<07:13,  1.13s/it]Train:  27%|██▋       | 143/525 [03:15<07:04,  1.11s/it]Train:  27%|██▋       | 144/525 [03:16<07:04,  1.11s/it]Train:  28%|██▊       | 145/525 [03:17<06:58,  1.10s/it]Train:  28%|██▊       | 146/525 [03:18<06:55,  1.10s/it]Train:  28%|██▊       | 147/525 [03:19<07:07,  1.13s/it]Train:  28%|██▊       | 148/525 [03:20<07:13,  1.15s/it]Train:  28%|██▊       | 149/525 [03:21<06:55,  1.10s/it]Train:  29%|██▊       | 150/525 [03:23<06:58,  1.11s/it]                                                        {'loss': 1.04806137, 'token_acc': 0.73847495, 'grad_norm': 1.91798937, 'learning_rate': 1.644e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.68844, 'epoch': 0.29, 'global_step/max_steps': '150/525', 'percentage': '28.57%', 'elapsed_time': '3m 23s', 'remaining_time': '8m 27s'}
+Train:  29%|██▊       | 150/525 [03:23<06:58,  1.11s/it]Train:  29%|██▊       | 150/525 [03:23<06:58,  1.11s/it]Train:  29%|██▉       | 151/525 [03:24<07:01,  1.13s/it]Train:  29%|██▉       | 152/525 [03:25<07:02,  1.13s/it]Train:  29%|██▉       | 153/525 [03:26<06:55,  1.12s/it]Train:  29%|██▉       | 154/525 [03:27<06:53,  1.11s/it]Train:  30%|██▉       | 155/525 [03:28<06:52,  1.11s/it]Train:  30%|██▉       | 156/525 [03:29<06:59,  1.14s/it]Train:  30%|██▉       | 157/525 [03:30<06:52,  1.12s/it]Train:  30%|███       | 158/525 [03:32<06:52,  1.12s/it]Train:  30%|███       | 159/525 [03:33<06:36,  1.08s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.52it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.00it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.52it/s]                                                                                                          {'eval_loss': 1.02398539, 'eval_token_acc': 0.75171497, 'eval_runtime': 1.3036, 'eval_samples_per_second': 383.556, 'eval_steps_per_second': 4.603, 'epoch': 0.3, 'global_step/max_steps': '159/525', 'percentage': '30.29%', 'elapsed_time': '3m 34s', 'remaining_time': '8m 13s'}
+Train:  30%|███       | 159/525 [03:34<06:36,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.52it/s]Train:  30%|███       | 159/525 [03:34<06:36,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  8.24it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-159
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  30%|███       | 160/525 [03:51<38:44,  6.37s/it]                                                        {'loss': 1.03953762, 'token_acc': 0.73360836, 'grad_norm': 2.19595003, 'learning_rate': 1.596e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.64887, 'epoch': 0.3, 'global_step/max_steps': '160/525', 'percentage': '30.48%', 'elapsed_time': '3m 51s', 'remaining_time': '8m 48s'}
+Train:  30%|███       | 160/525 [03:51<38:44,  6.37s/it]Train:  30%|███       | 160/525 [03:51<38:44,  6.37s/it]Train:  31%|███       | 161/525 [03:52<28:57,  4.77s/it]Train:  31%|███       | 162/525 [03:53<22:00,  3.64s/it]Train:  31%|███       | 163/525 [03:54<17:26,  2.89s/it]Train:  31%|███       | 164/525 [03:56<14:13,  2.36s/it]Train:  31%|███▏      | 165/525 [03:57<11:56,  1.99s/it]Train:  32%|███▏      | 166/525 [03:58<10:06,  1.69s/it]Train:  32%|███▏      | 167/525 [03:59<08:53,  1.49s/it]Train:  32%|███▏      | 168/525 [04:00<08:07,  1.36s/it]Train:  32%|███▏      | 169/525 [04:01<07:21,  1.24s/it]Train:  32%|███▏      | 170/525 [04:02<07:02,  1.19s/it]                                                        {'loss': 1.03059788, 'token_acc': 0.72712695, 'grad_norm': 1.79703772, 'learning_rate': 1.546e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.661154, 'epoch': 0.32, 'global_step/max_steps': '170/525', 'percentage': '32.38%', 'elapsed_time': '4m 2s', 'remaining_time': '8m 25s'}
+Train:  32%|███▏      | 170/525 [04:02<07:02,  1.19s/it]Train:  32%|███▏      | 170/525 [04:02<07:02,  1.19s/it]Train:  33%|███▎      | 171/525 [04:03<06:54,  1.17s/it]Train:  33%|███▎      | 172/525 [04:04<06:36,  1.12s/it]Train:  33%|███▎      | 173/525 [04:05<06:27,  1.10s/it]Train:  33%|███▎      | 174/525 [04:06<06:33,  1.12s/it]Train:  33%|███▎      | 175/525 [04:07<06:44,  1.16s/it]Train:  34%|███▎      | 176/525 [04:09<06:45,  1.16s/it]Train:  34%|███▎      | 177/525 [04:10<06:43,  1.16s/it]Train:  34%|███▍      | 178/525 [04:11<06:25,  1.11s/it]Train:  34%|███▍      | 179/525 [04:12<06:26,  1.12s/it]Train:  34%|███▍      | 180/525 [04:13<06:19,  1.10s/it]                                                        {'loss': 1.00962753, 'token_acc': 0.74938875, 'grad_norm': 2.01687217, 'learning_rate': 1.495e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.671041, 'epoch': 0.34, 'global_step/max_steps': '180/525', 'percentage': '34.29%', 'elapsed_time': '4m 13s', 'remaining_time': '8m 5s'}
+Train:  34%|███▍      | 180/525 [04:13<06:19,  1.10s/it]Train:  34%|███▍      | 180/525 [04:13<06:19,  1.10s/it]Train:  34%|███▍      | 181/525 [04:14<06:18,  1.10s/it]Train:  35%|███▍      | 182/525 [04:15<06:21,  1.11s/it]Train:  35%|███▍      | 183/525 [04:16<06:32,  1.15s/it]Train:  35%|███▌      | 184/525 [04:18<06:48,  1.20s/it]Train:  35%|███▌      | 185/525 [04:19<06:54,  1.22s/it]Train:  35%|███▌      | 186/525 [04:20<06:46,  1.20s/it]Train:  36%|███▌      | 187/525 [04:21<06:24,  1.14s/it]Train:  36%|███▌      | 188/525 [04:22<06:34,  1.17s/it]Train:  36%|███▌      | 189/525 [04:24<06:39,  1.19s/it]Train:  36%|███▌      | 190/525 [04:25<06:40,  1.20s/it]                                                        {'loss': 1.01629267, 'token_acc': 0.75722105, 'grad_norm': 2.16808534, 'learning_rate': 1.441e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.678237, 'epoch': 0.36, 'global_step/max_steps': '190/525', 'percentage': '36.19%', 'elapsed_time': '4m 25s', 'remaining_time': '7m 47s'}
+Train:  36%|███▌      | 190/525 [04:25<06:40,  1.20s/it]Train:  36%|███▌      | 190/525 [04:25<06:40,  1.20s/it]Train:  36%|███▋      | 191/525 [04:26<06:30,  1.17s/it]Train:  37%|███▋      | 192/525 [04:27<06:22,  1.15s/it]Train:  37%|███▋      | 193/525 [04:28<06:14,  1.13s/it]Train:  37%|███▋      | 194/525 [04:29<06:19,  1.15s/it]Train:  37%|███▋      | 195/525 [04:30<06:23,  1.16s/it]Train:  37%|███▋      | 196/525 [04:32<06:16,  1.14s/it]Train:  38%|███▊      | 197/525 [04:33<05:57,  1.09s/it]Train:  38%|███▊      | 198/525 [04:34<05:51,  1.07s/it]Train:  38%|███▊      | 199/525 [04:35<05:45,  1.06s/it]Train:  38%|███▊      | 200/525 [04:36<05:50,  1.08s/it]                                                        {'loss': 0.99183617, 'token_acc': 0.76208515, 'grad_norm': 2.11960721, 'learning_rate': 1.386e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.687125, 'epoch': 0.38, 'global_step/max_steps': '200/525', 'percentage': '38.10%', 'elapsed_time': '4m 36s', 'remaining_time': '7m 28s'}
+Train:  38%|███▊      | 200/525 [04:36<05:50,  1.08s/it]Train:  38%|███▊      | 200/525 [04:36<05:50,  1.08s/it]Train:  38%|███▊      | 201/525 [04:37<05:49,  1.08s/it]Train:  38%|███▊      | 202/525 [04:38<06:04,  1.13s/it]Train:  39%|███▊      | 203/525 [04:39<06:03,  1.13s/it]Train:  39%|███▉      | 204/525 [04:40<05:46,  1.08s/it]Train:  39%|███▉      | 205/525 [04:41<05:49,  1.09s/it]Train:  39%|███▉      | 206/525 [04:43<06:09,  1.16s/it]Train:  39%|███▉      | 207/525 [04:44<06:01,  1.14s/it]Train:  40%|███▉      | 208/525 [04:45<05:54,  1.12s/it]Train:  40%|███▉      | 209/525 [04:46<06:00,  1.14s/it]Train:  40%|████      | 210/525 [04:47<05:46,  1.10s/it]                                                        {'loss': 0.97124233, 'token_acc': 0.76954603, 'grad_norm': 2.0679419, 'learning_rate': 1.33e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.694724, 'epoch': 0.4, 'global_step/max_steps': '210/525', 'percentage': '40.00%', 'elapsed_time': '4m 47s', 'remaining_time': '7m 11s'}
+Train:  40%|████      | 210/525 [04:47<05:46,  1.10s/it]Train:  40%|████      | 210/525 [04:47<05:46,  1.10s/it]Train:  40%|████      | 211/525 [04:48<05:37,  1.08s/it]Train:  40%|████      | 212/525 [04:49<05:36,  1.08s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.56it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.08it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.20it/s]                                                                                                          {'eval_loss': 0.94692993, 'eval_token_acc': 0.77075338, 'eval_runtime': 1.3298, 'eval_samples_per_second': 375.993, 'eval_steps_per_second': 4.512, 'epoch': 0.4, 'global_step/max_steps': '212/525', 'percentage': '40.38%', 'elapsed_time': '4m 50s', 'remaining_time': '7m 9s'}
+Train:  40%|████      | 212/525 [04:50<05:36,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.20it/s]Train:  40%|████      | 212/525 [04:50<05:36,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.96it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-212
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  41%|████      | 213/525 [05:08<33:27,  6.44s/it]Train:  41%|████      | 214/525 [05:09<25:16,  4.88s/it]Train:  41%|████      | 215/525 [05:10<19:18,  3.74s/it]Train:  41%|████      | 216/525 [05:11<15:10,  2.95s/it]Train:  41%|████▏     | 217/525 [05:13<12:32,  2.44s/it]Train:  42%|████▏     | 218/525 [05:14<10:34,  2.07s/it]Train:  42%|████▏     | 219/525 [05:15<08:52,  1.74s/it]Train:  42%|████▏     | 220/525 [05:16<07:57,  1.57s/it]                                                        {'loss': 0.98910971, 'token_acc': 0.74121883, 'grad_norm': 2.02478981, 'learning_rate': 1.272e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.663984, 'epoch': 0.42, 'global_step/max_steps': '220/525', 'percentage': '41.90%', 'elapsed_time': '5m 16s', 'remaining_time': '7m 18s'}
+Train:  42%|████▏     | 220/525 [05:16<07:57,  1.57s/it]Train:  42%|████▏     | 220/525 [05:16<07:57,  1.57s/it]Train:  42%|████▏     | 221/525 [05:17<07:13,  1.43s/it]Train:  42%|████▏     | 222/525 [05:18<06:42,  1.33s/it]Train:  42%|████▏     | 223/525 [05:19<06:35,  1.31s/it]Train:  43%|████▎     | 224/525 [05:21<06:17,  1.26s/it]Train:  43%|████▎     | 225/525 [05:22<05:54,  1.18s/it]Train:  43%|████▎     | 226/525 [05:23<05:43,  1.15s/it]Train:  43%|████▎     | 227/525 [05:24<05:36,  1.13s/it]Train:  43%|████▎     | 228/525 [05:25<05:24,  1.09s/it]Train:  44%|████▎     | 229/525 [05:26<05:28,  1.11s/it]Train:  44%|████▍     | 230/525 [05:27<05:31,  1.12s/it]                                                        {'loss': 0.91966782, 'token_acc': 0.75729077, 'grad_norm': 1.87769961, 'learning_rate': 1.213e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.671734, 'epoch': 0.44, 'global_step/max_steps': '230/525', 'percentage': '43.81%', 'elapsed_time': '5m 27s', 'remaining_time': '7m 0s'}
+Train:  44%|████▍     | 230/525 [05:27<05:31,  1.12s/it]Train:  44%|████▍     | 230/525 [05:27<05:31,  1.12s/it]Train:  44%|████▍     | 231/525 [05:28<05:20,  1.09s/it]Train:  44%|████▍     | 232/525 [05:29<05:31,  1.13s/it]Train:  44%|████▍     | 233/525 [05:30<05:27,  1.12s/it]Train:  45%|████▍     | 234/525 [05:32<05:43,  1.18s/it]Train:  45%|████▍     | 235/525 [05:33<05:47,  1.20s/it]Train:  45%|████▍     | 236/525 [05:34<05:31,  1.15s/it]Train:  45%|████▌     | 237/525 [05:35<05:35,  1.17s/it]Train:  45%|████▌     | 238/525 [05:36<05:30,  1.15s/it]Train:  46%|████▌     | 239/525 [05:37<05:17,  1.11s/it]Train:  46%|████▌     | 240/525 [05:38<05:19,  1.12s/it]                                                        {'loss': 0.94313774, 'token_acc': 0.76240007, 'grad_norm': 1.87646425, 'learning_rate': 1.154e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.678325, 'epoch': 0.46, 'global_step/max_steps': '240/525', 'percentage': '45.71%', 'elapsed_time': '5m 38s', 'remaining_time': '6m 42s'}
+Train:  46%|████▌     | 240/525 [05:38<05:19,  1.12s/it]Train:  46%|████▌     | 240/525 [05:38<05:19,  1.12s/it]Train:  46%|████▌     | 241/525 [05:39<05:07,  1.08s/it]Train:  46%|████▌     | 242/525 [05:40<04:58,  1.06s/it]Train:  46%|████▋     | 243/525 [05:42<05:05,  1.08s/it]Train:  46%|████▋     | 244/525 [05:43<04:57,  1.06s/it]Train:  47%|████▋     | 245/525 [05:44<05:02,  1.08s/it]Train:  47%|████▋     | 246/525 [05:45<04:57,  1.07s/it]Train:  47%|████▋     | 247/525 [05:46<04:57,  1.07s/it]Train:  47%|████▋     | 248/525 [05:47<04:52,  1.06s/it]Train:  47%|████▋     | 249/525 [05:48<04:51,  1.06s/it]Train:  48%|████▊     | 250/525 [05:49<05:04,  1.11s/it]                                                        {'loss': 0.92292213, 'token_acc': 0.75403881, 'grad_norm': 2.03938174, 'learning_rate': 1.094e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.685879, 'epoch': 0.48, 'global_step/max_steps': '250/525', 'percentage': '47.62%', 'elapsed_time': '5m 49s', 'remaining_time': '6m 24s'}
+Train:  48%|████▊     | 250/525 [05:49<05:04,  1.11s/it]Train:  48%|████▊     | 250/525 [05:49<05:04,  1.11s/it]Train:  48%|████▊     | 251/525 [05:50<05:04,  1.11s/it]Train:  48%|████▊     | 252/525 [05:51<05:12,  1.14s/it]Train:  48%|████▊     | 253/525 [05:53<05:04,  1.12s/it]Train:  48%|████▊     | 254/525 [05:54<04:51,  1.07s/it]Train:  49%|████▊     | 255/525 [05:55<04:55,  1.09s/it]Train:  49%|████▉     | 256/525 [05:56<05:00,  1.12s/it]Train:  49%|████▉     | 257/525 [05:57<04:46,  1.07s/it]Train:  49%|████▉     | 258/525 [05:58<04:57,  1.11s/it]Train:  49%|████▉     | 259/525 [05:59<04:56,  1.11s/it]Train:  50%|████▉     | 260/525 [06:01<05:20,  1.21s/it]                                                        {'loss': 0.92472334, 'token_acc': 0.75957636, 'grad_norm': 2.17814589, 'learning_rate': 1.033e-05, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.691673, 'epoch': 0.5, 'global_step/max_steps': '260/525', 'percentage': '49.52%', 'elapsed_time': '6m 1s', 'remaining_time': '6m 7s'}
+Train:  50%|████▉     | 260/525 [06:01<05:20,  1.21s/it]Train:  50%|████▉     | 260/525 [06:01<05:20,  1.21s/it]Train:  50%|████▉     | 261/525 [06:02<05:10,  1.18s/it]Train:  50%|████▉     | 262/525 [06:03<04:58,  1.14s/it]Train:  50%|█████     | 263/525 [06:04<04:55,  1.13s/it]Train:  50%|█████     | 264/525 [06:05<04:50,  1.11s/it]Train:  50%|█████     | 265/525 [06:06<04:41,  1.08s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.48it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.08it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.56it/s]                                                                                                          {'eval_loss': 0.87573212, 'eval_token_acc': 0.78395938, 'eval_runtime': 1.4004, 'eval_samples_per_second': 357.045, 'eval_steps_per_second': 4.285, 'epoch': 0.5, 'global_step/max_steps': '265/525', 'percentage': '50.48%', 'elapsed_time': '6m 7s', 'remaining_time': '6m 0s'}
+Train:  50%|█████     | 265/525 [06:07<04:41,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  6.56it/s]Train:  50%|█████     | 265/525 [06:07<04:41,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.39it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-265
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  51%|█████     | 266/525 [06:25<28:26,  6.59s/it]Train:  51%|█████     | 267/525 [06:26<21:10,  4.92s/it]Train:  51%|█████     | 268/525 [06:28<16:16,  3.80s/it]Train:  51%|█████     | 269/525 [06:29<12:56,  3.03s/it]Train:  51%|█████▏    | 270/525 [06:30<10:22,  2.44s/it]                                                        {'loss': 0.90501995, 'token_acc': 0.76283467, 'grad_norm': 1.98729563, 'learning_rate': 9.73e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.666339, 'epoch': 0.51, 'global_step/max_steps': '270/525', 'percentage': '51.43%', 'elapsed_time': '6m 30s', 'remaining_time': '6m 8s'}
+Train:  51%|█████▏    | 270/525 [06:30<10:22,  2.44s/it]Train:  51%|█████▏    | 270/525 [06:30<10:22,  2.44s/it]Train:  52%|█████▏    | 271/525 [06:31<08:41,  2.05s/it]Train:  52%|█████▏    | 272/525 [06:32<07:38,  1.81s/it]Train:  52%|█████▏    | 273/525 [06:33<06:36,  1.57s/it]Train:  52%|█████▏    | 274/525 [06:34<06:04,  1.45s/it]Train:  52%|█████▏    | 275/525 [06:36<05:43,  1.37s/it]Train:  53%|█████▎    | 276/525 [06:37<05:47,  1.40s/it]Train:  53%|█████▎    | 277/525 [06:38<05:25,  1.31s/it]Train:  53%|█████▎    | 278/525 [06:39<05:01,  1.22s/it]Train:  53%|█████▎    | 279/525 [06:40<04:44,  1.15s/it]Train:  53%|█████▎    | 280/525 [06:41<04:40,  1.14s/it]                                                        {'loss': 0.91353035, 'token_acc': 0.77443247, 'grad_norm': 1.94012487, 'learning_rate': 9.12e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.672002, 'epoch': 0.53, 'global_step/max_steps': '280/525', 'percentage': '53.33%', 'elapsed_time': '6m 41s', 'remaining_time': '5m 51s'}
+Train:  53%|█████▎    | 280/525 [06:41<04:40,  1.14s/it]Train:  53%|█████▎    | 280/525 [06:41<04:40,  1.14s/it]Train:  54%|█████▎    | 281/525 [06:43<04:44,  1.17s/it]Train:  54%|█████▎    | 282/525 [06:44<04:40,  1.16s/it]Train:  54%|█████▍    | 283/525 [06:45<04:37,  1.15s/it]Train:  54%|█████▍    | 284/525 [06:46<04:29,  1.12s/it]Train:  54%|█████▍    | 285/525 [06:47<04:24,  1.10s/it]Train:  54%|█████▍    | 286/525 [06:48<04:28,  1.12s/it]Train:  55%|█████▍    | 287/525 [06:49<04:22,  1.10s/it]Train:  55%|█████▍    | 288/525 [06:50<04:22,  1.11s/it]Train:  55%|█████▌    | 289/525 [06:51<04:24,  1.12s/it]Train:  55%|█████▌    | 290/525 [06:53<04:27,  1.14s/it]                                                        {'loss': 0.90376043, 'token_acc': 0.7643521, 'grad_norm': 1.95544481, 'learning_rate': 8.52e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.677674, 'epoch': 0.55, 'global_step/max_steps': '290/525', 'percentage': '55.24%', 'elapsed_time': '6m 53s', 'remaining_time': '5m 34s'}
+Train:  55%|█████▌    | 290/525 [06:53<04:27,  1.14s/it]Train:  55%|█████▌    | 290/525 [06:53<04:27,  1.14s/it]Train:  55%|█████▌    | 291/525 [06:54<04:22,  1.12s/it]Train:  56%|█████▌    | 292/525 [06:55<04:32,  1.17s/it]Train:  56%|█████▌    | 293/525 [06:56<04:30,  1.17s/it]Train:  56%|█████▌    | 294/525 [06:57<04:25,  1.15s/it]Train:  56%|█████▌    | 295/525 [06:58<04:19,  1.13s/it]Train:  56%|█████▋    | 296/525 [06:59<04:21,  1.14s/it]Train:  57%|█████▋    | 297/525 [07:01<04:18,  1.13s/it]Train:  57%|█████▋    | 298/525 [07:02<04:15,  1.13s/it]Train:  57%|█████▋    | 299/525 [07:03<04:17,  1.14s/it]Train:  57%|█████▋    | 300/525 [07:04<04:11,  1.12s/it]                                                        {'loss': 0.91521149, 'token_acc': 0.7686918, 'grad_norm': 2.04080081, 'learning_rate': 7.93e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.682942, 'epoch': 0.57, 'global_step/max_steps': '300/525', 'percentage': '57.14%', 'elapsed_time': '7m 4s', 'remaining_time': '5m 18s'}
+Train:  57%|█████▋    | 300/525 [07:04<04:11,  1.12s/it]Train:  57%|█████▋    | 300/525 [07:04<04:11,  1.12s/it]Train:  57%|█████▋    | 301/525 [07:05<04:07,  1.10s/it]Train:  58%|█████▊    | 302/525 [07:06<04:02,  1.09s/it]Train:  58%|█████▊    | 303/525 [07:07<03:52,  1.05s/it]Train:  58%|█████▊    | 304/525 [07:08<03:49,  1.04s/it]Train:  58%|█████▊    | 305/525 [07:09<03:47,  1.04s/it]Train:  58%|█████▊    | 306/525 [07:10<04:00,  1.10s/it]Train:  58%|█████▊    | 307/525 [07:11<04:00,  1.10s/it]Train:  59%|█████▊    | 308/525 [07:13<04:01,  1.11s/it]Train:  59%|█████▉    | 309/525 [07:14<03:59,  1.11s/it]Train:  59%|█████▉    | 310/525 [07:15<03:51,  1.08s/it]                                                        {'loss': 0.86946774, 'token_acc': 0.77709611, 'grad_norm': 2.01810718, 'learning_rate': 7.34e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.68891, 'epoch': 0.59, 'global_step/max_steps': '310/525', 'percentage': '59.05%', 'elapsed_time': '7m 15s', 'remaining_time': '5m 1s'}
+Train:  59%|█████▉    | 310/525 [07:15<03:51,  1.08s/it]Train:  59%|█████▉    | 310/525 [07:15<03:51,  1.08s/it]Train:  59%|█████▉    | 311/525 [07:16<03:45,  1.06s/it]Train:  59%|█████▉    | 312/525 [07:17<03:49,  1.08s/it]Train:  60%|█████▉    | 313/525 [07:18<03:53,  1.10s/it]Train:  60%|█████▉    | 314/525 [07:19<03:47,  1.08s/it]Train:  60%|██████    | 315/525 [07:20<03:47,  1.08s/it]Train:  60%|██████    | 316/525 [07:21<03:44,  1.08s/it]Train:  60%|██████    | 317/525 [07:22<03:41,  1.07s/it]Train:  61%|██████    | 318/525 [07:23<03:39,  1.06s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.50it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.09it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.16it/s]                                                                                                          {'eval_loss': 0.81832546, 'eval_token_acc': 0.79616814, 'eval_runtime': 1.3416, 'eval_samples_per_second': 372.676, 'eval_steps_per_second': 4.472, 'epoch': 0.61, 'global_step/max_steps': '318/525', 'percentage': '60.57%', 'elapsed_time': '7m 25s', 'remaining_time': '4m 49s'}
+Train:  61%|██████    | 318/525 [07:25<03:39,  1.06s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.16it/s]Train:  61%|██████    | 318/525 [07:25<03:39,  1.06s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.94it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-318
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  61%|██████    | 319/525 [07:42<22:11,  6.46s/it]Train:  61%|██████    | 320/525 [07:43<16:30,  4.83s/it]                                                        {'loss': 0.88454618, 'token_acc': 0.76034612, 'grad_norm': 2.0867815, 'learning_rate': 6.76e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.668554, 'epoch': 0.61, 'global_step/max_steps': '320/525', 'percentage': '60.95%', 'elapsed_time': '7m 43s', 'remaining_time': '4m 57s'}
+Train:  61%|██████    | 320/525 [07:43<16:30,  4.83s/it]Train:  61%|██████    | 320/525 [07:43<16:30,  4.83s/it]Train:  61%|██████    | 321/525 [07:44<12:38,  3.72s/it]Train:  61%|██████▏   | 322/525 [07:45<09:51,  2.91s/it]Train:  62%|██████▏   | 323/525 [07:47<08:10,  2.43s/it]Train:  62%|██████▏   | 324/525 [07:48<06:42,  2.00s/it]Train:  62%|██████▏   | 325/525 [07:49<05:42,  1.71s/it]Train:  62%|██████▏   | 326/525 [07:50<04:57,  1.50s/it]Train:  62%|██████▏   | 327/525 [07:51<04:37,  1.40s/it]Train:  62%|██████▏   | 328/525 [07:52<04:16,  1.30s/it]Train:  63%|██████▎   | 329/525 [07:53<04:01,  1.23s/it]Train:  63%|██████▎   | 330/525 [07:54<03:53,  1.20s/it]                                                        {'loss': 0.87842607, 'token_acc': 0.76381523, 'grad_norm': 2.01089287, 'learning_rate': 6.19e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.674077, 'epoch': 0.63, 'global_step/max_steps': '330/525', 'percentage': '62.86%', 'elapsed_time': '7m 54s', 'remaining_time': '4m 40s'}
+Train:  63%|██████▎   | 330/525 [07:54<03:53,  1.20s/it]Train:  63%|██████▎   | 330/525 [07:54<03:53,  1.20s/it]Train:  63%|██████▎   | 331/525 [07:55<03:50,  1.19s/it]Train:  63%|██████▎   | 332/525 [07:57<03:48,  1.18s/it]Train:  63%|██████▎   | 333/525 [07:58<03:43,  1.16s/it]Train:  64%|██████▎   | 334/525 [07:59<03:44,  1.18s/it]Train:  64%|██████▍   | 335/525 [08:00<03:42,  1.17s/it]Train:  64%|██████▍   | 336/525 [08:01<03:38,  1.16s/it]Train:  64%|██████▍   | 337/525 [08:02<03:26,  1.10s/it]Train:  64%|██████▍   | 338/525 [08:03<03:21,  1.08s/it]Train:  65%|██████▍   | 339/525 [08:04<03:26,  1.11s/it]Train:  65%|██████▍   | 340/525 [08:05<03:21,  1.09s/it]                                                        {'loss': 0.86033983, 'token_acc': 0.78780178, 'grad_norm': 2.11734009, 'learning_rate': 5.64e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.67901, 'epoch': 0.65, 'global_step/max_steps': '340/525', 'percentage': '64.76%', 'elapsed_time': '8m 5s', 'remaining_time': '4m 24s'}
+Train:  65%|██████▍   | 340/525 [08:05<03:21,  1.09s/it]Train:  65%|██████▍   | 340/525 [08:05<03:21,  1.09s/it]Train:  65%|██████▍   | 341/525 [08:06<03:15,  1.07s/it]Train:  65%|██████▌   | 342/525 [08:07<03:14,  1.06s/it]Train:  65%|██████▌   | 343/525 [08:09<03:22,  1.11s/it]Train:  66%|██████▌   | 344/525 [08:10<03:15,  1.08s/it]Train:  66%|██████▌   | 345/525 [08:11<03:17,  1.10s/it]Train:  66%|██████▌   | 346/525 [08:12<03:17,  1.10s/it]Train:  66%|██████▌   | 347/525 [08:13<03:11,  1.08s/it]Train:  66%|██████▋   | 348/525 [08:14<03:02,  1.03s/it]Train:  66%|██████▋   | 349/525 [08:15<03:09,  1.08s/it]Train:  67%|██████▋   | 350/525 [08:16<03:06,  1.07s/it]                                                        {'loss': 0.88362503, 'token_acc': 0.77859882, 'grad_norm': 2.01630521, 'learning_rate': 5.11e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.684334, 'epoch': 0.67, 'global_step/max_steps': '350/525', 'percentage': '66.67%', 'elapsed_time': '8m 16s', 'remaining_time': '4m 8s'}
+Train:  67%|██████▋   | 350/525 [08:16<03:06,  1.07s/it]Train:  67%|██████▋   | 350/525 [08:16<03:06,  1.07s/it]Train:  67%|██████▋   | 351/525 [08:17<03:06,  1.07s/it]Train:  67%|██████▋   | 352/525 [08:18<03:05,  1.07s/it]Train:  67%|██████▋   | 353/525 [08:19<03:13,  1.12s/it]Train:  67%|██████▋   | 354/525 [08:21<03:07,  1.09s/it]Train:  68%|██████▊   | 355/525 [08:22<03:01,  1.07s/it]Train:  68%|██████▊   | 356/525 [08:23<03:04,  1.09s/it]Train:  68%|██████▊   | 357/525 [08:24<03:13,  1.15s/it]Train:  68%|██████▊   | 358/525 [08:25<03:13,  1.16s/it]Train:  68%|██████▊   | 359/525 [08:26<03:09,  1.14s/it]Train:  69%|██████▊   | 360/525 [08:27<03:02,  1.11s/it]                                                        {'loss': 0.85622959, 'token_acc': 0.76745421, 'grad_norm': 2.1268034, 'learning_rate': 4.59e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.68884, 'epoch': 0.69, 'global_step/max_steps': '360/525', 'percentage': '68.57%', 'elapsed_time': '8m 27s', 'remaining_time': '3m 52s'}
+Train:  69%|██████▊   | 360/525 [08:27<03:02,  1.11s/it]Train:  69%|██████▊   | 360/525 [08:27<03:02,  1.11s/it]Train:  69%|██████▉   | 361/525 [08:29<03:09,  1.16s/it]Train:  69%|██████▉   | 362/525 [08:30<03:06,  1.14s/it]Train:  69%|██████▉   | 363/525 [08:31<02:59,  1.11s/it]Train:  69%|██████▉   | 364/525 [08:32<02:53,  1.08s/it]Train:  70%|██████▉   | 365/525 [08:33<02:57,  1.11s/it]Train:  70%|██████▉   | 366/525 [08:34<03:02,  1.15s/it]Train:  70%|██████▉   | 367/525 [08:35<02:55,  1.11s/it]Train:  70%|███████   | 368/525 [08:36<02:50,  1.08s/it]Train:  70%|███████   | 369/525 [08:37<02:45,  1.06s/it]Train:  70%|███████   | 370/525 [08:38<02:46,  1.08s/it]                                                        {'loss': 0.87056913, 'token_acc': 0.7632952, 'grad_norm': 1.82094967, 'learning_rate': 4.09e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.69337, 'epoch': 0.7, 'global_step/max_steps': '370/525', 'percentage': '70.48%', 'elapsed_time': '8m 38s', 'remaining_time': '3m 37s'}
+Train:  70%|███████   | 370/525 [08:38<02:46,  1.08s/it]Train:  70%|███████   | 370/525 [08:38<02:46,  1.08s/it]Train:  71%|███████   | 371/525 [08:39<02:44,  1.07s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.59it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.10it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.87it/s]                                                                                                          {'eval_loss': 0.78008133, 'eval_token_acc': 0.80490163, 'eval_runtime': 1.3661, 'eval_samples_per_second': 366.006, 'eval_steps_per_second': 4.392, 'epoch': 0.71, 'global_step/max_steps': '371/525', 'percentage': '70.67%', 'elapsed_time': '8m 41s', 'remaining_time': '3m 36s'}
+Train:  71%|███████   | 371/525 [08:41<02:44,  1.07s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  6.87it/s]Train:  71%|███████   | 371/525 [08:41<02:44,  1.07s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.68it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-371
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  71%|███████   | 372/525 [08:58<16:30,  6.47s/it]Train:  71%|███████   | 373/525 [09:00<12:21,  4.88s/it]Train:  71%|███████   | 374/525 [09:01<09:27,  3.76s/it]Train:  71%|███████▏  | 375/525 [09:02<07:25,  2.97s/it]Train:  72%|███████▏  | 376/525 [09:03<05:54,  2.38s/it]Train:  72%|███████▏  | 377/525 [09:04<04:55,  1.99s/it]Train:  72%|███████▏  | 378/525 [09:05<04:09,  1.70s/it]Train:  72%|███████▏  | 379/525 [09:06<03:45,  1.54s/it]Train:  72%|███████▏  | 380/525 [09:07<03:22,  1.40s/it]                                                        {'loss': 0.85663776, 'token_acc': 0.76751564, 'grad_norm': 1.99219036, 'learning_rate': 3.61e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.675513, 'epoch': 0.72, 'global_step/max_steps': '380/525', 'percentage': '72.38%', 'elapsed_time': '9m 7s', 'remaining_time': '3m 28s'}
+Train:  72%|███████▏  | 380/525 [09:07<03:22,  1.40s/it]Train:  72%|███████▏  | 380/525 [09:07<03:22,  1.40s/it]Train:  73%|███████▎  | 381/525 [09:08<03:12,  1.33s/it]Train:  73%|███████▎  | 382/525 [09:09<02:58,  1.25s/it]Train:  73%|███████▎  | 383/525 [09:11<02:51,  1.21s/it]Train:  73%|███████▎  | 384/525 [09:12<02:46,  1.18s/it]Train:  73%|███████▎  | 385/525 [09:13<02:34,  1.11s/it]Train:  74%|███████▎  | 386/525 [09:14<02:34,  1.11s/it]Train:  74%|███████▎  | 387/525 [09:15<02:38,  1.15s/it]Train:  74%|███████▍  | 388/525 [09:16<02:36,  1.14s/it]Train:  74%|███████▍  | 389/525 [09:17<02:34,  1.14s/it]Train:  74%|███████▍  | 390/525 [09:18<02:26,  1.09s/it]                                                        {'loss': 0.84692335, 'token_acc': 0.79125184, 'grad_norm': 2.1994772, 'learning_rate': 3.16e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.680018, 'epoch': 0.74, 'global_step/max_steps': '390/525', 'percentage': '74.29%', 'elapsed_time': '9m 18s', 'remaining_time': '3m 13s'}
+Train:  74%|███████▍  | 390/525 [09:18<02:26,  1.09s/it]Train:  74%|███████▍  | 390/525 [09:18<02:26,  1.09s/it]Train:  74%|███████▍  | 391/525 [09:19<02:22,  1.07s/it]Train:  75%|███████▍  | 392/525 [09:20<02:31,  1.14s/it]Train:  75%|███████▍  | 393/525 [09:22<02:30,  1.14s/it]Train:  75%|███████▌  | 394/525 [09:23<02:23,  1.09s/it]Train:  75%|███████▌  | 395/525 [09:24<02:21,  1.09s/it]Train:  75%|███████▌  | 396/525 [09:25<02:22,  1.11s/it]Train:  76%|███████▌  | 397/525 [09:26<02:24,  1.13s/it]Train:  76%|███████▌  | 398/525 [09:27<02:19,  1.10s/it]Train:  76%|███████▌  | 399/525 [09:28<02:20,  1.11s/it]Train:  76%|███████▌  | 400/525 [09:29<02:20,  1.13s/it]                                                        {'loss': 0.83600502, 'token_acc': 0.7907637, 'grad_norm': 1.71462512, 'learning_rate': 2.73e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.684101, 'epoch': 0.76, 'global_step/max_steps': '400/525', 'percentage': '76.19%', 'elapsed_time': '9m 29s', 'remaining_time': '2m 58s'}
+Train:  76%|███████▌  | 400/525 [09:29<02:20,  1.13s/it]Train:  76%|███████▌  | 400/525 [09:29<02:20,  1.13s/it]Train:  76%|███████▋  | 401/525 [09:30<02:20,  1.13s/it]Train:  77%|███████▋  | 402/525 [09:31<02:13,  1.09s/it]Train:  77%|███████▋  | 403/525 [09:32<02:07,  1.05s/it]Train:  77%|███████▋  | 404/525 [09:34<02:12,  1.09s/it]Train:  77%|███████▋  | 405/525 [09:35<02:04,  1.04s/it]Train:  77%|███████▋  | 406/525 [09:36<02:02,  1.03s/it]Train:  78%|███████▊  | 407/525 [09:37<02:03,  1.05s/it]Train:  78%|███████▊  | 408/525 [09:38<02:06,  1.08s/it]Train:  78%|███████▊  | 409/525 [09:39<02:07,  1.10s/it]Train:  78%|███████▊  | 410/525 [09:40<02:05,  1.09s/it]                                                        {'loss': 0.85053272, 'token_acc': 0.78034516, 'grad_norm': 1.96433842, 'learning_rate': 2.33e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.688647, 'epoch': 0.78, 'global_step/max_steps': '410/525', 'percentage': '78.10%', 'elapsed_time': '9m 40s', 'remaining_time': '2m 42s'}
+Train:  78%|███████▊  | 410/525 [09:40<02:05,  1.09s/it]Train:  78%|███████▊  | 410/525 [09:40<02:05,  1.09s/it]Train:  78%|███████▊  | 411/525 [09:41<02:07,  1.12s/it]Train:  78%|███████▊  | 412/525 [09:42<02:06,  1.12s/it]Train:  79%|███████▊  | 413/525 [09:43<02:03,  1.10s/it]Train:  79%|███████▉  | 414/525 [09:44<02:01,  1.09s/it]Train:  79%|███████▉  | 415/525 [09:46<02:00,  1.09s/it]Train:  79%|███████▉  | 416/525 [09:47<01:58,  1.09s/it]Train:  79%|███████▉  | 417/525 [09:48<01:57,  1.08s/it]Train:  80%|███████▉  | 418/525 [09:49<01:59,  1.12s/it]Train:  80%|███████▉  | 419/525 [09:50<02:06,  1.19s/it]Train:  80%|████████  | 420/525 [09:51<01:58,  1.12s/it]                                                        {'loss': 0.82590809, 'token_acc': 0.77511563, 'grad_norm': 2.12512207, 'learning_rate': 1.95e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.692413, 'epoch': 0.8, 'global_step/max_steps': '420/525', 'percentage': '80.00%', 'elapsed_time': '9m 51s', 'remaining_time': '2m 27s'}
+Train:  80%|████████  | 420/525 [09:51<01:58,  1.12s/it]Train:  80%|████████  | 420/525 [09:51<01:58,  1.12s/it]Train:  80%|████████  | 421/525 [09:52<01:57,  1.13s/it]Train:  80%|████████  | 422/525 [09:54<01:57,  1.14s/it]Train:  81%|████████  | 423/525 [09:55<01:53,  1.11s/it]Train:  81%|████████  | 424/525 [09:56<01:48,  1.08s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.50it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.01it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.30it/s]                                                                                                          {'eval_loss': 0.75265592, 'eval_token_acc': 0.8119126, 'eval_runtime': 1.3415, 'eval_samples_per_second': 372.727, 'eval_steps_per_second': 4.473, 'epoch': 0.81, 'global_step/max_steps': '424/525', 'percentage': '80.76%', 'elapsed_time': '9m 57s', 'remaining_time': '2m 22s'}
+Train:  81%|████████  | 424/525 [09:57<01:48,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.30it/s]Train:  81%|████████  | 424/525 [09:57<01:48,  1.08s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  8.05it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-424
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  81%|████████  | 425/525 [10:15<11:07,  6.68s/it]Train:  81%|████████  | 426/525 [10:17<08:20,  5.05s/it]Train:  81%|████████▏ | 427/525 [10:18<06:17,  3.86s/it]Train:  82%|████████▏ | 428/525 [10:19<04:53,  3.02s/it]Train:  82%|████████▏ | 429/525 [10:20<03:55,  2.45s/it]Train:  82%|████████▏ | 430/525 [10:21<03:13,  2.04s/it]                                                        {'loss': 0.84262543, 'token_acc': 0.77875936, 'grad_norm': 1.97538006, 'learning_rate': 1.61e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.675831, 'epoch': 0.82, 'global_step/max_steps': '430/525', 'percentage': '81.90%', 'elapsed_time': '10m 21s', 'remaining_time': '2m 17s'}
+Train:  82%|████████▏ | 430/525 [10:21<03:13,  2.04s/it]Train:  82%|████████▏ | 430/525 [10:21<03:13,  2.04s/it]Train:  82%|████████▏ | 431/525 [10:22<02:40,  1.71s/it]Train:  82%|████████▏ | 432/525 [10:23<02:23,  1.55s/it]Train:  82%|████████▏ | 433/525 [10:24<02:10,  1.42s/it]Train:  83%|████████▎ | 434/525 [10:25<01:56,  1.28s/it]Train:  83%|████████▎ | 435/525 [10:26<01:51,  1.24s/it]Train:  83%|████████▎ | 436/525 [10:27<01:47,  1.21s/it]Train:  83%|████████▎ | 437/525 [10:29<01:45,  1.20s/it]Train:  83%|████████▎ | 438/525 [10:30<01:43,  1.19s/it]Train:  84%|████████▎ | 439/525 [10:31<01:39,  1.16s/it]Train:  84%|████████▍ | 440/525 [10:32<01:36,  1.14s/it]                                                        {'loss': 0.82615795, 'token_acc': 0.77460043, 'grad_norm': 1.98043156, 'learning_rate': 1.29e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.679781, 'epoch': 0.84, 'global_step/max_steps': '440/525', 'percentage': '83.81%', 'elapsed_time': '10m 32s', 'remaining_time': '2m 2s'}
+Train:  84%|████████▍ | 440/525 [10:32<01:36,  1.14s/it]Train:  84%|████████▍ | 440/525 [10:32<01:36,  1.14s/it]Train:  84%|████████▍ | 441/525 [10:33<01:33,  1.11s/it]Train:  84%|████████▍ | 442/525 [10:34<01:35,  1.14s/it]Train:  84%|████████▍ | 443/525 [10:35<01:30,  1.11s/it]Train:  85%|████████▍ | 444/525 [10:36<01:27,  1.08s/it]Train:  85%|████████▍ | 445/525 [10:37<01:25,  1.07s/it]Train:  85%|████████▍ | 446/525 [10:38<01:26,  1.10s/it]Train:  85%|████████▌ | 447/525 [10:39<01:20,  1.04s/it]Train:  85%|████████▌ | 448/525 [10:40<01:19,  1.03s/it]Train:  86%|████████▌ | 449/525 [10:42<01:21,  1.08s/it]Train:  86%|████████▌ | 450/525 [10:43<01:22,  1.10s/it]                                                        {'loss': 0.83664207, 'token_acc': 0.79628217, 'grad_norm': 1.86360788, 'learning_rate': 1.01e-06, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.683863, 'epoch': 0.86, 'global_step/max_steps': '450/525', 'percentage': '85.71%', 'elapsed_time': '10m 43s', 'remaining_time': '1m 47s'}
+Train:  86%|████████▌ | 450/525 [10:43<01:22,  1.10s/it]Train:  86%|████████▌ | 450/525 [10:43<01:22,  1.10s/it]Train:  86%|████████▌ | 451/525 [10:44<01:18,  1.06s/it]Train:  86%|████████▌ | 452/525 [10:45<01:20,  1.10s/it]Train:  86%|████████▋ | 453/525 [10:46<01:17,  1.08s/it]Train:  86%|████████▋ | 454/525 [10:47<01:25,  1.21s/it]Train:  87%|████████▋ | 455/525 [10:48<01:22,  1.18s/it]Train:  87%|████████▋ | 456/525 [10:49<01:17,  1.13s/it]Train:  87%|████████▋ | 457/525 [10:51<01:14,  1.10s/it]Train:  87%|████████▋ | 458/525 [10:51<01:09,  1.04s/it]Train:  87%|████████▋ | 459/525 [10:53<01:10,  1.07s/it]Train:  88%|████████▊ | 460/525 [10:54<01:07,  1.04s/it]                                                        {'loss': 0.82346592, 'token_acc': 0.78506883, 'grad_norm': 1.84809279, 'learning_rate': 7.6e-07, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.687735, 'epoch': 0.88, 'global_step/max_steps': '460/525', 'percentage': '87.62%', 'elapsed_time': '10m 54s', 'remaining_time': '1m 32s'}
+Train:  88%|████████▊ | 460/525 [10:54<01:07,  1.04s/it]Train:  88%|████████▊ | 460/525 [10:54<01:07,  1.04s/it]Train:  88%|████████▊ | 461/525 [10:55<01:07,  1.06s/it]Train:  88%|████████▊ | 462/525 [10:56<01:11,  1.13s/it]Train:  88%|████████▊ | 463/525 [10:57<01:11,  1.15s/it]Train:  88%|████████▊ | 464/525 [10:58<01:11,  1.18s/it]Train:  89%|████████▊ | 465/525 [10:59<01:08,  1.14s/it]Train:  89%|████████▉ | 466/525 [11:01<01:07,  1.14s/it]Train:  89%|████████▉ | 467/525 [11:02<01:06,  1.14s/it]Train:  89%|████████▉ | 468/525 [11:03<01:04,  1.13s/it]Train:  89%|████████▉ | 469/525 [11:04<01:02,  1.11s/it]Train:  90%|████████▉ | 470/525 [11:05<00:59,  1.09s/it]                                                        {'loss': 0.81804752, 'token_acc': 0.79058844, 'grad_norm': 1.93893504, 'learning_rate': 5.5e-07, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.690923, 'epoch': 0.9, 'global_step/max_steps': '470/525', 'percentage': '89.52%', 'elapsed_time': '11m 5s', 'remaining_time': '1m 17s'}
+Train:  90%|████████▉ | 470/525 [11:05<00:59,  1.09s/it]Train:  90%|████████▉ | 470/525 [11:05<00:59,  1.09s/it]Train:  90%|████████▉ | 471/525 [11:06<00:57,  1.07s/it]Train:  90%|████████▉ | 472/525 [11:07<00:59,  1.12s/it]Train:  90%|█████████ | 473/525 [11:08<00:55,  1.08s/it]Train:  90%|█████████ | 474/525 [11:09<00:53,  1.05s/it]Train:  90%|█████████ | 475/525 [11:10<00:53,  1.07s/it]Train:  91%|█████████ | 476/525 [11:11<00:53,  1.09s/it]Train:  91%|█████████ | 477/525 [11:12<00:50,  1.05s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.63it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.08it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.07it/s]                                                                                                          {'eval_loss': 0.74242377, 'eval_token_acc': 0.81472304, 'eval_runtime': 1.5617, 'eval_samples_per_second': 320.166, 'eval_steps_per_second': 3.842, 'epoch': 0.91, 'global_step/max_steps': '477/525', 'percentage': '90.86%', 'elapsed_time': '11m 14s', 'remaining_time': '1m 7s'}
+Train:  91%|█████████ | 477/525 [11:14<00:50,  1.05s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.07it/s]Train:  91%|█████████ | 477/525 [11:14<00:50,  1.05s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.85it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-477
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  91%|█████████ | 478/525 [11:31<05:02,  6.44s/it]Train:  91%|█████████ | 479/525 [11:32<03:40,  4.80s/it]Train:  91%|█████████▏| 480/525 [11:33<02:44,  3.66s/it]                                                        {'loss': 0.81797428, 'token_acc': 0.78304666, 'grad_norm': 1.86061883, 'learning_rate': 3.7e-07, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.677322, 'epoch': 0.91, 'global_step/max_steps': '480/525', 'percentage': '91.43%', 'elapsed_time': '11m 33s', 'remaining_time': '1m 5s'}
+Train:  91%|█████████▏| 480/525 [11:33<02:44,  3.66s/it]Train:  91%|█████████▏| 480/525 [11:33<02:44,  3.66s/it]Train:  92%|█████████▏| 481/525 [11:34<02:06,  2.87s/it]Train:  92%|█████████▏| 482/525 [11:35<01:40,  2.34s/it]Train:  92%|█████████▏| 483/525 [11:37<01:23,  1.98s/it]Train:  92%|█████████▏| 484/525 [11:38<01:10,  1.72s/it]Train:  92%|█████████▏| 485/525 [11:39<01:03,  1.60s/it]Train:  93%|█████████▎| 486/525 [11:40<00:54,  1.41s/it]Train:  93%|█████████▎| 487/525 [11:41<00:51,  1.36s/it]Train:  93%|█████████▎| 488/525 [11:42<00:46,  1.26s/it]Train:  93%|█████████▎| 489/525 [11:43<00:42,  1.19s/it]Train:  93%|█████████▎| 490/525 [11:44<00:39,  1.14s/it]                                                        {'loss': 0.85071688, 'token_acc': 0.77893883, 'grad_norm': 2.28605318, 'learning_rate': 2.2e-07, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.680869, 'epoch': 0.93, 'global_step/max_steps': '490/525', 'percentage': '93.33%', 'elapsed_time': '11m 44s', 'remaining_time': '50s'}
+Train:  93%|█████████▎| 490/525 [11:44<00:39,  1.14s/it]Train:  93%|█████████▎| 490/525 [11:44<00:39,  1.14s/it]Train:  94%|█████████▎| 491/525 [11:45<00:38,  1.13s/it]Train:  94%|█████████▎| 492/525 [11:46<00:35,  1.07s/it]Train:  94%|█████████▍| 493/525 [11:47<00:34,  1.07s/it]Train:  94%|█████████▍| 494/525 [11:49<00:35,  1.16s/it]Train:  94%|█████████▍| 495/525 [11:50<00:33,  1.12s/it]Train:  94%|█████████▍| 496/525 [11:51<00:32,  1.12s/it]Train:  95%|█████████▍| 497/525 [11:52<00:30,  1.09s/it]Train:  95%|█████████▍| 498/525 [11:53<00:29,  1.11s/it]Train:  95%|█████████▌| 499/525 [11:54<00:28,  1.08s/it]Train:  95%|█████████▌| 500/525 [11:55<00:26,  1.08s/it]                                                        {'loss': 0.81089058, 'token_acc': 0.79508076, 'grad_norm': 1.79387105, 'learning_rate': 1.1e-07, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.684417, 'epoch': 0.95, 'global_step/max_steps': '500/525', 'percentage': '95.24%', 'elapsed_time': '11m 55s', 'remaining_time': '35s'}
+Train:  95%|█████████▌| 500/525 [11:55<00:26,  1.08s/it]Train:  95%|█████████▌| 500/525 [11:55<00:26,  1.08s/it]Train:  95%|█████████▌| 501/525 [11:56<00:25,  1.06s/it]Train:  96%|█████████▌| 502/525 [11:57<00:24,  1.06s/it]Train:  96%|█████████▌| 503/525 [11:58<00:23,  1.05s/it]Train:  96%|█████████▌| 504/525 [11:59<00:22,  1.07s/it]Train:  96%|█████████▌| 505/525 [12:00<00:20,  1.04s/it]Train:  96%|█████████▋| 506/525 [12:02<00:20,  1.07s/it]Train:  97%|█████████▋| 507/525 [12:03<00:19,  1.07s/it]Train:  97%|█████████▋| 508/525 [12:04<00:18,  1.07s/it]Train:  97%|█████████▋| 509/525 [12:05<00:16,  1.05s/it]Train:  97%|█████████▋| 510/525 [12:06<00:15,  1.07s/it]                                                        {'loss': 0.81196289, 'token_acc': 0.79439987, 'grad_norm': 1.80153871, 'learning_rate': 4e-08, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.688134, 'epoch': 0.97, 'global_step/max_steps': '510/525', 'percentage': '97.14%', 'elapsed_time': '12m 6s', 'remaining_time': '21s'}
+Train:  97%|█████████▋| 510/525 [12:06<00:15,  1.07s/it]Train:  97%|█████████▋| 510/525 [12:06<00:15,  1.07s/it]Train:  97%|█████████▋| 511/525 [12:07<00:14,  1.03s/it]Train:  98%|█████████▊| 512/525 [12:08<00:13,  1.05s/it]Train:  98%|█████████▊| 513/525 [12:09<00:12,  1.07s/it]Train:  98%|█████████▊| 514/525 [12:10<00:11,  1.07s/it]Train:  98%|█████████▊| 515/525 [12:11<00:10,  1.07s/it]Train:  98%|█████████▊| 516/525 [12:12<00:09,  1.02s/it]Train:  98%|█████████▊| 517/525 [12:13<00:08,  1.03s/it]Train:  99%|█████████▊| 518/525 [12:14<00:07,  1.09s/it]Train:  99%|█████████▉| 519/525 [12:15<00:06,  1.09s/it]Train:  99%|█████████▉| 520/525 [12:17<00:05,  1.15s/it]                                                        {'loss': 0.81841869, 'token_acc': 0.79345839, 'grad_norm': 1.86123455, 'learning_rate': 0.0, 'memory(GiB)': 70.83, 'train_speed(iter/s)': 0.691507, 'epoch': 0.99, 'global_step/max_steps': '520/525', 'percentage': '99.05%', 'elapsed_time': '12m 17s', 'remaining_time': '7s'}
+Train:  99%|█████████▉| 520/525 [12:17<00:05,  1.15s/it]Train:  99%|█████████▉| 520/525 [12:17<00:05,  1.15s/it]Train:  99%|█████████▉| 521/525 [12:18<00:04,  1.11s/it]Train:  99%|█████████▉| 522/525 [12:19<00:03,  1.07s/it]Train: 100%|█████████▉| 523/525 [12:20<00:02,  1.12s/it]Train: 100%|█████████▉| 524/525 [12:21<00:01,  1.06s/it]Train: 100%|██████████| 525/525 [12:22<00:00,  1.07s/it]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 17.58it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00, 10.08it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.81it/s]                                                                                                          {'eval_loss': 0.74077374, 'eval_token_acc': 0.81448128, 'eval_runtime': 1.5731, 'eval_samples_per_second': 317.836, 'eval_steps_per_second': 3.814, 'epoch': 1.0, 'global_step/max_steps': '525/525', 'percentage': '100.00%', 'elapsed_time': '12m 23s', 'remaining_time': '0s'}
+Train: 100%|██████████| 525/525 [12:23<00:00,  1.07s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  6.81it/s]Train: 100%|██████████| 525/525 [12:23<00:00,  1.07s/it]Val: 100%|██████████| 6/6 [00:00<00:00,  7.61it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/checkpoint-525
+                                                        {'train_runtime': 760.7789, 'train_samples_per_second': 132.485, 'train_steps_per_second': 0.69, 'train_loss': 0.98217008, 'epoch': 1.0, 'global_step/max_steps': '525/525', 'percentage': '100.00%', 'elapsed_time': '12m 40s', 'remaining_time': '0s'}
+Train: 100%|██████████| 525/525 [12:40<00:00,  1.07s/it]Train: 100%|██████████| 525/525 [12:40<00:00,  1.07s/it]Train: 100%|██████████| 525/525 [12:40<00:00,  1.45s/it]
+[INFO:swift] last_model_checkpoint: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/last
+[INFO:swift] best_model_checkpoint: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] images_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/images
+[INFO:swift] End time of running main: 2025-09-15 15:49:37.754880
++ bash inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=1
++ NPROC_PER_NODE=1
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ swift infer --port 9897 --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
+[2025-09-15 15:50:09,924] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 1 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --port 9897 --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+[2025-09-15 15:50:15,185] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py", line 5, in <module>
+[rank0]:     infer_main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 291, in infer_main
+[rank0]:     return SwiftInfer(args).main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 24, in __init__
+[rank0]:     super().__init__(args)
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 19, in __init__
+[rank0]:     self.args = self._parse_args(args)
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 36, in _parse_args
+[rank0]:     raise ValueError(f'remaining_argv: {remaining_argv}')
+[rank0]: ValueError: remaining_argv: ['--port', '9897']
+E0915 15:50:17.095000 132195510662656 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 1215863) of binary: /mnt/nvme1/luoyingfeng/h200_ms/bin/python
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-09-15_15:50:17
+  host      : localhost
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 1215863)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+jq: error: Could not open file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl: No such file or directory
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=1
++ NPROC_PER_NODE=1
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+[2025-09-15 17:43:27,742] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 1 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+[2025-09-15 17:43:32,947] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=8, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Patch tp_plan.
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'auto'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 17:43:37.881049
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py", line 5, in <module>
+[rank0]:     infer_main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 291, in infer_main
+[rank0]:     return SwiftInfer(args).main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank0]:     result = self.run()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 91, in run
+[rank0]:     result = self.infer_dataset()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 214, in infer_dataset
+[rank0]:     val_dataset = self._prepare_val_dataset()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/infer/infer.py", line 179, in _prepare_val_dataset
+[rank0]:     _, val_dataset = load_dataset(
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank0]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank0]:     dataset = DatasetLoader._load_repo_dataset(
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank0]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank0]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test.en2zh.jsonl`. os.path.exists(dataset_id): False
+E0915 17:43:39.320000 136047222146560 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 1226647) of binary: /mnt/nvme1/luoyingfeng/h200_ms/bin/python
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-09-15_17:43:39
+  host      : localhost
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 1226647)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+jq: error: Could not open file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl: No such file or directory
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=1
++ NPROC_PER_NODE=1
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
+[2025-09-15 17:45:47,994] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 1 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 8 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+[2025-09-15 17:45:53,177] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=8, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Patch tp_plan.
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'auto'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 17:45:58.081119
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 197737.62 examples/s]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5918.53 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/1000 [00:00<?, ?it/s][A
+  1%|          | 8/1000 [00:04<09:22,  1.76it/s][A
+  2%|▏         | 16/1000 [00:07<07:35,  2.16it/s][A
+  2%|▏         | 24/1000 [00:12<08:17,  1.96it/s][A
+  3%|▎         | 32/1000 [00:14<07:08,  2.26it/s][A
+  4%|▍         | 40/1000 [00:17<06:05,  2.62it/s][A
+  5%|▍         | 48/1000 [00:19<05:36,  2.83it/s][A
+  6%|▌         | 56/1000 [00:23<06:33,  2.40it/s][A
+  6%|▋         | 64/1000 [00:27<06:29,  2.40it/s][A
+  7%|▋         | 72/1000 [00:31<06:54,  2.24it/s][A
+  8%|▊         | 80/1000 [00:34<06:26,  2.38it/s][A
+  9%|▉         | 88/1000 [00:36<05:53,  2.58it/s][A
+ 10%|▉         | 96/1000 [00:38<04:58,  3.03it/s][A
+ 10%|█         | 104/1000 [00:41<05:24,  2.76it/s][A
+ 11%|█         | 112/1000 [00:45<05:50,  2.54it/s][A
+ 12%|█▏        | 120/1000 [00:47<05:11,  2.82it/s][A
+ 13%|█▎        | 128/1000 [00:51<05:40,  2.56it/s][A
+ 14%|█▎        | 136/1000 [00:53<04:54,  2.94it/s][A
+ 14%|█▍        | 144/1000 [00:55<04:31,  3.16it/s][A
+ 15%|█▌        | 152/1000 [00:58<04:47,  2.95it/s][A
+ 16%|█▌        | 160/1000 [01:00<04:40,  3.00it/s][A
+ 17%|█▋        | 168/1000 [01:06<05:56,  2.33it/s][A
+ 18%|█▊        | 176/1000 [01:08<05:15,  2.61it/s][A
+ 18%|█▊        | 184/1000 [01:11<05:17,  2.57it/s][A
+ 19%|█▉        | 192/1000 [01:15<05:31,  2.44it/s][A
+ 20%|██        | 200/1000 [01:18<05:38,  2.36it/s][A
+ 21%|██        | 208/1000 [01:22<05:31,  2.39it/s][A
+ 22%|██▏       | 216/1000 [01:24<05:04,  2.57it/s][A
+ 22%|██▏       | 224/1000 [01:26<04:34,  2.82it/s][A
+ 23%|██▎       | 232/1000 [01:28<04:08,  3.09it/s][A
+ 24%|██▍       | 240/1000 [01:32<04:25,  2.87it/s][A
+ 25%|██▍       | 248/1000 [01:35<04:48,  2.61it/s][A
+ 26%|██▌       | 256/1000 [01:37<04:11,  2.96it/s][A
+ 26%|██▋       | 264/1000 [01:40<03:57,  3.10it/s][A
+ 27%|██▋       | 272/1000 [01:43<04:21,  2.78it/s][A
+ 28%|██▊       | 280/1000 [01:49<05:45,  2.08it/s][A
+ 29%|██▉       | 288/1000 [01:51<04:45,  2.50it/s][A
+ 30%|██▉       | 296/1000 [01:54<04:28,  2.62it/s][A
+ 30%|███       | 304/1000 [01:56<04:03,  2.85it/s][A
+ 31%|███       | 312/1000 [01:58<03:48,  3.01it/s][A
+ 32%|███▏      | 320/1000 [02:00<03:19,  3.41it/s][A
+ 33%|███▎      | 328/1000 [02:03<03:30,  3.20it/s][A
+ 34%|███▎      | 336/1000 [02:05<03:28,  3.18it/s][A
+ 34%|███▍      | 344/1000 [02:09<04:09,  2.63it/s][A
+ 35%|███▌      | 352/1000 [02:11<03:36,  3.00it/s][A
+ 36%|███▌      | 360/1000 [02:13<03:14,  3.30it/s][A
+ 37%|███▋      | 368/1000 [02:15<03:09,  3.34it/s][A
+ 38%|███▊      | 376/1000 [02:18<03:17,  3.16it/s][A
+ 38%|███▊      | 384/1000 [02:21<03:07,  3.28it/s][A
+ 39%|███▉      | 392/1000 [02:23<03:06,  3.25it/s][A
+ 40%|████      | 400/1000 [02:26<03:16,  3.05it/s][A
+ 41%|████      | 408/1000 [02:29<03:28,  2.84it/s][A
+ 42%|████▏     | 416/1000 [02:32<03:30,  2.78it/s][A
+ 42%|████▏     | 424/1000 [02:34<03:11,  3.01it/s][A
+ 43%|████▎     | 432/1000 [02:38<03:22,  2.81it/s][A
+ 44%|████▍     | 440/1000 [02:40<02:58,  3.14it/s][A
+ 45%|████▍     | 448/1000 [02:42<02:45,  3.34it/s][A
+ 46%|████▌     | 456/1000 [02:45<02:56,  3.08it/s][A
+ 46%|████▋     | 464/1000 [02:47<02:47,  3.20it/s][A
+ 47%|████▋     | 472/1000 [02:52<03:35,  2.45it/s][A
+ 48%|████▊     | 480/1000 [02:55<03:24,  2.54it/s][A
+ 49%|████▉     | 488/1000 [02:58<03:17,  2.59it/s][A
+ 50%|████▉     | 496/1000 [03:01<03:08,  2.67it/s][A
+ 50%|█████     | 504/1000 [03:03<02:48,  2.95it/s][A
+ 51%|█████     | 512/1000 [03:06<02:48,  2.90it/s][A
+ 52%|█████▏    | 520/1000 [03:07<02:21,  3.40it/s][A
+ 53%|█████▎    | 528/1000 [03:10<02:23,  3.29it/s][A
+ 54%|█████▎    | 536/1000 [03:12<02:12,  3.51it/s][A
+ 54%|█████▍    | 544/1000 [03:14<02:05,  3.62it/s][A
+ 55%|█████▌    | 552/1000 [03:18<02:41,  2.78it/s][A
+ 56%|█████▌    | 560/1000 [03:21<02:37,  2.79it/s][A
+ 57%|█████▋    | 568/1000 [03:24<02:34,  2.79it/s][A
+ 58%|█████▊    | 576/1000 [03:27<02:46,  2.55it/s][A
+ 58%|█████▊    | 584/1000 [03:29<02:20,  2.96it/s][A
+ 59%|█████▉    | 592/1000 [03:31<02:09,  3.15it/s][A
+ 60%|██████    | 600/1000 [03:33<01:56,  3.42it/s][A
+ 61%|██████    | 608/1000 [03:35<01:49,  3.59it/s][A
+ 62%|██████▏   | 616/1000 [03:37<01:42,  3.75it/s][A
+ 62%|██████▏   | 624/1000 [03:39<01:36,  3.90it/s][A
+ 63%|██████▎   | 632/1000 [03:41<01:34,  3.91it/s][A
+ 64%|██████▍   | 640/1000 [03:43<01:38,  3.67it/s][A
+ 65%|██████▍   | 648/1000 [03:45<01:25,  4.11it/s][A
+ 66%|██████▌   | 656/1000 [03:47<01:21,  4.21it/s][A
+ 66%|██████▋   | 664/1000 [03:49<01:22,  4.05it/s][A
+ 67%|██████▋   | 672/1000 [03:51<01:23,  3.91it/s][A
+ 68%|██████▊   | 680/1000 [03:53<01:20,  3.99it/s][A
+ 69%|██████▉   | 688/1000 [03:56<01:28,  3.54it/s][A
+ 70%|██████▉   | 696/1000 [03:59<01:30,  3.34it/s][A
+ 70%|███████   | 704/1000 [04:01<01:29,  3.31it/s][A
+ 71%|███████   | 712/1000 [04:04<01:31,  3.14it/s][A
+ 72%|███████▏  | 720/1000 [04:06<01:26,  3.22it/s][A
+ 73%|███████▎  | 728/1000 [04:09<01:24,  3.21it/s][A
+ 74%|███████▎  | 736/1000 [04:11<01:21,  3.26it/s][A
+ 74%|███████▍  | 744/1000 [04:14<01:24,  3.02it/s][A
+ 75%|███████▌  | 752/1000 [04:16<01:18,  3.16it/s][A
+ 76%|███████▌  | 760/1000 [04:18<01:10,  3.41it/s][A
+ 77%|███████▋  | 768/1000 [04:20<01:05,  3.52it/s][A
+ 78%|███████▊  | 776/1000 [04:23<01:03,  3.55it/s][A
+ 78%|███████▊  | 784/1000 [04:25<01:02,  3.47it/s][A
+ 79%|███████▉  | 792/1000 [04:28<01:03,  3.27it/s][A
+ 80%|████████  | 800/1000 [04:30<00:58,  3.45it/s][A
+ 81%|████████  | 808/1000 [04:32<00:51,  3.76it/s][A
+ 82%|████████▏ | 816/1000 [04:34<00:51,  3.55it/s][A
+ 82%|████████▏ | 824/1000 [04:36<00:46,  3.80it/s][A
+ 83%|████████▎ | 832/1000 [04:39<00:53,  3.15it/s][A
+ 84%|████████▍ | 840/1000 [04:41<00:48,  3.33it/s][A
+ 85%|████████▍ | 848/1000 [04:44<00:46,  3.28it/s][A
+ 86%|████████��� | 856/1000 [04:46<00:43,  3.35it/s][A
+ 86%|████████▋ | 864/1000 [04:49<00:40,  3.38it/s][A
+ 87%|████████▋ | 872/1000 [04:51<00:38,  3.32it/s][A
+ 88%|████████▊ | 880/1000 [04:53<00:35,  3.39it/s][A
+ 89%|████████▉ | 888/1000 [04:55<00:30,  3.65it/s][A
+ 90%|████████▉ | 896/1000 [04:57<00:26,  3.92it/s][A
+ 90%|█████████ | 904/1000 [05:00<00:27,  3.49it/s][A
+ 91%|█████████ | 912/1000 [05:02<00:25,  3.50it/s][A
+ 92%|█████████▏| 920/1000 [05:04<00:22,  3.50it/s][A
+ 93%|█████████▎| 928/1000 [05:06<00:20,  3.58it/s][A
+ 94%|█████████▎| 936/1000 [05:09<00:18,  3.54it/s][A
+ 94%|█████████▍| 944/1000 [05:11<00:16,  3.43it/s][A
+ 95%|█████████▌| 952/1000 [05:14<00:14,  3.21it/s][A
+ 96%|█████████▌| 960/1000 [05:17<00:12,  3.19it/s][A
+ 97%|█████████▋| 968/1000 [05:20<00:10,  3.02it/s][A
+ 98%|█████████▊| 976/1000 [05:22<00:07,  3.14it/s][A
+ 98%|█████████▊| 984/1000 [05:24<00:04,  3.39it/s][A
+ 99%|█████████▉| 992/1000 [05:26<00:02,  3.46it/s][A
+100%|██████████| 1000/1000 [05:28<00:00,  3.54it/s][A100%|██████████| 1000/1000 [05:28<00:00,  3.04it/s]
+ 99%|█████████▉| 1000/1012 [05:28<00:03,  3.04it/s]
+  0%|          | 0/12 [00:00<?, ?it/s][A
+ 67%|██████▋   | 8/12 [00:02<00:01,  3.80it/s][A
+100%|██████████| 12/12 [00:03<00:00,  3.50it/s][A100%|██████████| 12/12 [00:03<00:00,  3.56it/s]
+100%|██████████| 1012/1012 [05:32<00:00,  3.05it/s]100%|██████████| 1012/1012 [05:32<00:00,  3.05it/s]
+[rank0] {'num_prompt_tokens': 63020, 'num_generated_tokens': 27421, 'num_samples': 1012, 'runtime': 332.23795900680125, 'samples/s': 3.0460095620178165, 'tokens/s': 82.53421758902229}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 17:51:31.112513
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
+inference.sh: line 75: src_: unbound variable
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=1
++ NPROC_PER_NODE=1
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
+[2025-09-15 17:53:11,734] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 1 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+[2025-09-15 17:53:16,933] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Patch tp_plan.
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'auto'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 17:53:21.882285
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/1000 [00:00<?, ?it/s][A
+  2%|▏         | 16/1000 [00:05<05:10,  3.17it/s][A
+  3%|▎         | 32/1000 [00:10<05:19,  3.03it/s][A
+  5%|▍         | 48/1000 [00:13<04:06,  3.87it/s][A
+  6%|▋         | 64/1000 [00:18<04:18,  3.62it/s][A
+  8%|���         | 80/1000 [00:22<04:12,  3.64it/s][A
+ 10%|▉         | 96/1000 [00:25<03:41,  4.08it/s][A
+ 11%|█         | 112/1000 [00:30<03:51,  3.84it/s][A
+ 13%|█▎        | 128/1000 [00:34<03:51,  3.77it/s][A
+ 14%|█▍        | 144/1000 [00:37<03:18,  4.32it/s][A
+ 16%|█▌        | 160/1000 [00:40<03:13,  4.34it/s][A
+ 18%|█▊        | 176/1000 [00:46<03:48,  3.61it/s][A
+ 19%|█▉        | 192/1000 [00:51<03:41,  3.65it/s][A
+ 21%|██        | 208/1000 [00:55<03:34,  3.69it/s][A
+ 22%|██▏       | 224/1000 [00:58<03:10,  4.08it/s][A
+ 24%|██▍       | 240/1000 [01:02<03:05,  4.09it/s][A
+ 26%|██▌       | 256/1000 [01:06<03:04,  4.04it/s][A
+ 27%|██▋       | 272/1000 [01:10<03:01,  4.00it/s][A
+ 29%|██▉       | 288/1000 [01:17<03:45,  3.16it/s][AW0915 17:54:40.662000 131955434169856 torch/distributed/elastic/agent/server/api.py:688] Received Signals.SIGTERM death signal, shutting down workers
+W0915 17:54:40.663000 131955434169856 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1231447 closing signal SIGTERM
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
+    result = agent.run()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
+    result = f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
+    result = self._invoke_run(role)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 835, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 79, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 1231369 got signal: 15
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+jq: error: Could not open file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl: No such file or directory
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
+[2025-09-15 17:55:59,730] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 17:56:06,394] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 17:56:06,746] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 17:56:06,800] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[2025-09-15 17:56:07,233] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-15 17:56:07,251] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 17:56:07,259] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 17:56:07,273] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 17:56:07,280] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.30s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.32s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 17:56:11.704955
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.40it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.70it/s][A
+
+ 13%|█▎        | 16/125 [00:02<00:19,  5.62it/s][A 13%|█▎        | 16/125 [00:02<00:19,  5.63it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.22it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.03it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.54it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.11it/s][A
+ 26%|██▌       | 32/125 [00:04<00:12,  7.35it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.08it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.83it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.56it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.38it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.87it/s][A
+ 38%|███▊      | 48/125 [00:06<00:10,  7.49it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.50it/s][A
+ 38%|███▊      | 48/125 [00:08<00:12,  5.98it/s][A
+ 38%|███▊      | 48/125 [00:08<00:12,  6.06it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.62it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.77it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.11it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  6.78it/s][A
+ 26%|██▌       | 32/125 [00:10<00:31,  2.96it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.39it/s][A
+ 38%|███▊      | 48/125 [00:10<00:19,  4.02it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.63it/s][A
+ 64%|██████▍   | 80/125 [00:11<00:06,  6.50it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:06,  6.44it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.03it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.89it/s][A
+ 51%|█████     | 64/125 [00:12<00:13,  4.69it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.67it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.52it/s][A
+ 77%|███████▋  | 96/125 [00:14<00:04,  6.28it/s][A
+ 51%|█████     | 64/125 [00:14<00:12,  4.78it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.36it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.30it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.14it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.74it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:10,  4.49it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:01,  6.62it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:02,  6.06it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.44it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:08,  5.09it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.38it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.26it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.97it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.16it/s][A
+100%|██████████| 125/125 [00:19<00:00,  6.58it/s][A100%|██████████| 125/125 [00:19<00:00,  6.46it/s]
+
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.95it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.72it/s][A100%|██████████| 125/125 [00:20<00:00,  6.21it/s]
+
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.88it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:05,  4.93it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.84it/s][A100%|██████████| 125/125 [00:21<00:00,  5.89it/s]
+
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.58it/s][A
+100%|██████████| 125/125 [00:22<00:00,  6.17it/s][A100%|██████████| 125/125 [00:22<00:00,  5.49it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.92it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.38it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.35it/s][A
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.63it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.48it/s][A100%|██████████| 125/125 [00:26<00:00,  4.76it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  5.23it/s][A100%|██████████| 125/125 [00:26<00:00,  4.73it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.61it/s][A100%|██████████| 125/125 [00:26<00:00,  4.70it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.37it/s][A100%|██████████| 125/125 [00:28<00:00,  4.41it/s]
+ 99%|█████████▉| 1000/1012 [00:28<00:00, 35.14it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.13it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.13it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.15it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.13it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.13it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.13it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:28<00:00, 34.85it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.40it/s][A100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.11it/s][A100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.89it/s][A100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.59it/s][A100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A100%|██████████| 2/2 [00:01<00:00,  1.93it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.43it/s][A100%|██████████| 2/2 [00:01<00:00,  1.43it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.35it/s][A100%|██████████| 2/2 [00:01<00:00,  1.35it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.21it/s][A100%|██████████| 2/2 [00:01<00:00,  1.21it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 32.67it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.69it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.66it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.67it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.67it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.66it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.36it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 32.66it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.38it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 33.36it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.36it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.37it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.36it/s]
+[rank6] {'num_prompt_tokens': 7602, 'num_generated_tokens': 3264, 'num_samples': 126, 'runtime': 30.334505895152688, 'samples/s': 4.153685589457194, 'tokens/s': 107.6002362221292}
+
+
+
+100%|██████████| 1012/1012 [00:30<00:00, 32.75it/s][rank1] {'num_prompt_tokens': 8644, 'num_generated_tokens': 3668, 'num_samples': 127, 'runtime': 30.316879883408546, 'samples/s': 4.1890854365096795, 'tokens/s': 120.98870378832679}
+100%|██████████| 1012/1012 [00:30<00:00, 33.36it/s]
+[rank4] {'num_prompt_tokens': 8327, 'num_generated_tokens': 3232, 'num_samples': 126, 'runtime': 30.337702928110957, 'samples/s': 4.153247867795826, 'tokens/s': 106.53410403742943}
+[rank2] {'num_prompt_tokens': 9022, 'num_generated_tokens': 3542, 'num_samples': 127, 'runtime': 30.33174271695316, 'samples/s': 4.187032746028686, 'tokens/s': 116.77535422388668}[rank5] {'num_prompt_tokens': 8274, 'num_generated_tokens': 3164, 'num_samples': 126, 'runtime': 30.338826464489102, 'samples/s': 4.15309406075677, 'tokens/s': 104.28880641455889}
+
+[rank7] {'num_prompt_tokens': 7844, 'num_generated_tokens': 3083, 'num_samples': 126, 'runtime': 30.33921553567052, 'samples/s': 4.153040801330505, 'tokens/s': 101.61765706747578}
+100%|██████████| 1012/1012 [00:30<00:00, 33.34it/s]
+[rank3] {'num_prompt_tokens': 8911, 'num_generated_tokens': 3550, 'num_samples': 127, 'runtime': 30.338653963059187, 'samples/s': 4.186078926066963, 'tokens/s': 117.01244242155683}
+[rank0] {'num_prompt_tokens': 9270, 'num_generated_tokens': 3854, 'num_samples': 127, 'runtime': 30.352336963638663, 'samples/s': 4.1841918186445675, 'tokens/s': 126.97539581933988}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 17:56:45.664803
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
+inference.sh: line 75: src_: unbound variable
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
++ comet_model=/mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+[2025-09-15 18:05:57,538] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:06:04,302] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:06:04,720] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:06:04,810] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:06:04,849] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:06:04,923] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:06:04,972] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:06:04,984] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:06:04,987] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.38s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.06s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:06:09.109185
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.32it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.08it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.74it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.43it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.23it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.95it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.82it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.09it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  7.09it/s][A
+ 26%|██▌       | 32/125 [00:04<00:14,  6.51it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.84it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.78it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.52it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.01it/s][A
+ 38%|███▊      | 48/125 [00:06<00:10,  7.23it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.39it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.50it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.53it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.69it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.22it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  6.83it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.55it/s][A
+ 26%|██▌       | 32/125 [00:09<00:29,  3.10it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.76it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.68it/s][A
+ 38%|███▊      | 48/125 [00:11<00:19,  4.00it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.05it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  4.05it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.29it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.12it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.41it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.02it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.50it/s][A
+ 51%|█████     | 64/125 [00:14<00:12,  4.93it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.39it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.07it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.26it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.09it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.23it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:09,  4.85it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:08,  5.24it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:01,  6.57it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.59it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.43it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:02,  6.34it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.86it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.35it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.40it/s][A
+ 90%|████���███▉ | 112/125 [00:19<00:02,  6.37it/s][A
+100%|██████████| 125/125 [00:19<00:00,  6.53it/s][A100%|██████████| 125/125 [00:19<00:00,  6.43it/s]
+
+100%|██████████| 125/125 [00:19<00:00,  6.20it/s][A100%|██████████| 125/125 [00:19<00:00,  6.27it/s]
+
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.93it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.52it/s][A100%|██████████| 125/125 [00:20<00:00,  5.99it/s]
+
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.94it/s][A
+100%|██████████| 125/125 [00:21<00:00,  6.70it/s][A100%|██████████| 125/125 [00:21<00:00,  5.94it/s]
+
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.68it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.45it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.46it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.90it/s][A
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.64it/s][A
+100%|██████████| 125/125 [00:25<00:00,  5.33it/s][A100%|██████████| 125/125 [00:25<00:00,  4.85it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  4.55it/s][A100%|██████████| 125/125 [00:25<00:00,  4.84it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.59it/s][A100%|██████████| 125/125 [00:26<00:00,  4.67it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.38it/s][A100%|██████████| 125/125 [00:28<00:00,  4.43it/s]
+ 99%|█████████▉| 1000/1012 [00:28<00:00, 35.30it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.31it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.33it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.30it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.30it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.32it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.32it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:28<00:00, 35.02it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.61it/s][A100%|██████████| 1/1 [00:00<00:00,  2.61it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.35it/s][A100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.80it/s][A100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.57it/s][A100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A100%|██████████| 2/2 [00:01<00:00,  1.96it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.46it/s][A100%|██████████| 2/2 [00:01<00:00,  1.46it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.36it/s][A100%|██████████| 2/2 [00:01<00:00,  1.36it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.21it/s][A100%|██████████| 2/2 [00:01<00:00,  1.21it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 32.84it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.83it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.82it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.82it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.54it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.83it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 33.53it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.52it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 33.52it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.82it/s]
+
+100%|██████████| 1012/1012 [00:30<00:00, 32.82it/s][rank1] {'num_prompt_tokens': 8644, 'num_generated_tokens': 3668, 'num_samples': 127, 'runtime': 30.17341610044241, 'samples/s': 4.20900303688643, 'tokens/s': 121.56396172676712}[rank7] {'num_prompt_tokens': 7844, 'num_generated_tokens': 3083, 'num_samples': 126, 'runtime': 30.184400089085102, 'samples/s': 4.174341700617814, 'tokens/s': 102.13885288098984}
+
+[rank6] {'num_prompt_tokens': 7602, 'num_generated_tokens': 3264, 'num_samples': 126, 'runtime': 30.19432684779167, 'samples/s': 4.172969334112355, 'tokens/s': 108.09977703605337}
+100%|██████████| 1012/1012 [00:30<00:00, 33.53it/s]
+[rank2] {'num_prompt_tokens': 9022, 'num_generated_tokens': 3542, 'num_samples': 127, 'runtime': 30.192890809848905, 'samples/s': 4.206288188826645, 'tokens/s': 117.31238397499193}
+100%|██████████| 1012/1012 [00:30<00:00, 33.52it/s][rank5] {'num_prompt_tokens': 8274, 'num_generated_tokens': 3164, 'num_samples': 126, 'runtime': 30.18227994814515, 'samples/s': 4.17463492540905, 'tokens/s': 104.8297214602717}
+
+100%|██████████| 1012/1012 [00:30<00:00, 32.90it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.52it/s]
+[rank4] {'num_prompt_tokens': 8327, 'num_generated_tokens': 3232, 'num_samples': 126, 'runtime': 30.19321396574378, 'samples/s': 4.173123144258687, 'tokens/s': 107.04392065273078}
+[rank3] {'num_prompt_tokens': 8911, 'num_generated_tokens': 3550, 'num_samples': 127, 'runtime': 30.195404147729278, 'samples/s': 4.205938075167327, 'tokens/s': 117.56756036885048}
+100%|██████████| 1012/1012 [00:30<00:00, 33.50it/s]
+[rank0] {'num_prompt_tokens': 9270, 'num_generated_tokens': 3854, 'num_samples': 127, 'runtime': 30.21063050441444, 'samples/s': 4.203818254684969, 'tokens/s': 127.57098861067615}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:06:42.851239
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=en
++ lp=zh2en
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/train.log
+[2025-09-15 18:07:15,681] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:07:22,709] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:07:22,794] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:07:22,913] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:07:23,054] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:07:23,109] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:07:23,154] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:07:23,155] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:07:23,156] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.12s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.52s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:07:27.162841
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 86549.27 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 7382.84 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:16,  6.65it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.19it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.76it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.18it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.14it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.84it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.89it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.77it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  7.08it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  6.70it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.94it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.67it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.34it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.16it/s][A
+ 38%|███▊      | 48/125 [00:06<00:10,  7.18it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.40it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.29it/s][A
+ 26%|██▌       | 32/125 [00:07<00:23,  3.93it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.58it/s][A
+ 26%|██▌       | 32/125 [00:08<00:26,  3.55it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.37it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  6.81it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  6.83it/s][A
+ 38%|███▊      | 48/125 [00:09<00:16,  4.68it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.27it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.24it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.48it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.46it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:06,  6.69it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  5.84it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.76it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  5.01it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.81it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:08,  5.54it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.43it/s][A
+ 51%|█████     | 64/125 [00:14<00:12,  4.88it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  5.99it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:05,  5.67it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.10it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:04,  5.83it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.26it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.60it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.62it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:08,  5.06it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:02,  6.11it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.87it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.38it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  6.02it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.40it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.37it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.29it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.92it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.82it/s][A100%|██████████| 125/125 [00:20<00:00,  6.12it/s]
+
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.88it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.37it/s][A100%|██████████| 125/125 [00:21<00:00,  5.84it/s]
+
+100%|██████████| 125/125 [00:21<00:00,  5.69it/s][A100%|██████████| 125/125 [00:21<00:00,  5.83it/s]
+
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.59it/s][A
+100%|██████████| 125/125 [00:22<00:00,  4.91it/s][A100%|██████████| 125/125 [00:22<00:00,  5.45it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.60it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.91it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.71it/s][A100%|██████████| 125/125 [00:23<00:00,  5.26it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  5.11it/s][A100%|██████████| 125/125 [00:25<00:00,  4.98it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  4.68it/s][A100%|██████████| 125/125 [00:25<00:00,  4.85it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.72it/s][A100%|██████████| 125/125 [00:26<00:00,  4.68it/s]
+ 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s] 99%|█████████▉| 1000/1012 [00:26<00:00, 37.30it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:27<00:00, 36.92it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.07it/s][A100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.00it/s][A100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.72it/s][A100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.48it/s][A100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A100%|██████████| 2/2 [00:01<00:00,  1.88it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.37it/s][A100%|██████████| 2/2 [00:01<00:00,  1.36it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.36it/s][A100%|██████████| 2/2 [00:01<00:00,  1.36it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.31it/s][A100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
+100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]100%|██████████| 1012/1012 [00:28<00:00, 34.62it/s]
+100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]
+
+100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]
+100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s][rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 3574, 'num_samples': 126, 'runtime': 28.608290281146765, 'samples/s': 4.404317726146523, 'tokens/s': 124.92882185117202}
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 3559, 'num_samples': 127, 'runtime': 28.608240945264697, 'samples/s': 4.439280284411242, 'tokens/s': 124.40471285212291}[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 3402, 'num_samples': 126, 'runtime': 28.608192820101976, 'samples/s': 4.4043327305688535, 'tokens/s': 118.91698372535903}
+
+
+100%|██████████| 1012/1012 [00:28<00:00, 35.38it/s]
+
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 3492, 'num_samples': 126, 'runtime': 28.608241276815534, 'samples/s': 4.404325270498608, 'tokens/s': 122.06272892524711}
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 3759, 'num_samples': 127, 'runtime': 28.608320228755474, 'samples/s': 4.439267981639367, 'tokens/s': 131.39534128332585}
+100%|██████████| 1012/1012 [00:28<00:00, 34.74it/s][rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 3468, 'num_samples': 127, 'runtime': 28.608018171042204, 'samples/s': 4.439314853643122, 'tokens/s': 121.22475521601848}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 3460, 'num_samples': 126, 'runtime': 28.607540387660265, 'samples/s': 4.404433177147572, 'tokens/s': 120.947133277227}
+100%|██████████| 1012/1012 [00:28<00:00, 35.36it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 3846, 'num_samples': 127, 'runtime': 28.62280515022576, 'samples/s': 4.437021435650527, 'tokens/s': 134.36838142922778}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:07:59.686706
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt
++ lang_pair_strs=en2zh,zh2en
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' de = zh ']'
++ src_lang=de
++ tgt_lang=zh
++ lp=de2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/train.log
+[2025-09-15 18:08:32,468] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:08:39,439] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:39,492] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:08:39,944] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:39,996] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:40,022] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:40,036] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:40,037] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:08:40,039] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.42s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.43s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.00s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.40s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.54s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.57s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:08:43.972467
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 186094.77 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5854.56 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.19it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  6.02it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.26it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.82it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.67it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.34it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.28it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.28it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  6.73it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.82it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.13it/s][A
+ 26%|██▌       | 32/125 [00:06<00:16,  5.56it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.46it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.44it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.28it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.77it/s][A
+ 26%|██▌       | 32/125 [00:07<00:23,  3.90it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.14it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.79it/s][A
+ 38%|███▊      | 48/125 [00:08<00:15,  5.09it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.02it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.04it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.34it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.20it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.27it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.25it/s][A
+ 51%|█████     | 64/125 [00:11<00:11,  5.43it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.11it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.22it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.72it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.75it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.70it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.90it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.33it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.04it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:09,  4.59it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.67it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.13it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.68it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.53it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:04,  5.87it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.75it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.53it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.02it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  6.12it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.06it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.43it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  6.01it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.99it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.85it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.80it/s][A
+100%|██████████| 125/125 [00:20<00:00,  6.03it/s][A100%|██████████| 125/125 [00:20<00:00,  5.99it/s]
+
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.22it/s][A
+100%|██████████| 125/125 [00:21<00:00,  6.23it/s][A100%|██████████| 125/125 [00:21<00:00,  5.72it/s]
+
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.64it/s][A
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.16it/s][A
+100%|██████████| 125/125 [00:23<00:00,  4.85it/s][A100%|██████████| 125/125 [00:23<00:00,  5.37it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.84it/s][A
+100%|██████████| 125/125 [00:24<00:00,  5.35it/s][A100%|██████████| 125/125 [00:24<00:00,  5.09it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  4.64it/s][A100%|██████████| 125/125 [00:25<00:00,  5.00it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  5.08it/s][A100%|██████████| 125/125 [00:25<00:00,  4.94it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:03,  4.20it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.81it/s][A100%|██████████| 125/125 [00:26<00:00,  4.73it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.32it/s][A100%|██████████| 125/125 [00:28<00:00,  4.42it/s]
+ 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 35.19it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:28<00:00, 34.91it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.11it/s][A100%|██████████| 1/1 [00:00<00:00,  2.11it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.86it/s][A100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.79it/s][A100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.62it/s][A100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.71it/s][A100%|██████████| 2/2 [00:01<00:00,  1.71it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.31it/s][A100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.49it/s][A100%|██████████| 2/2 [00:01<00:00,  1.49it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]
+
+100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]
+
+100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]
+[rank5] {'num_prompt_tokens': 11084, 'num_generated_tokens': 3280, 'num_samples': 126, 'runtime': 29.992954773828387, 'samples/s': 4.200986563349423, 'tokens/s': 109.35901529988975}
+[rank7] {'num_prompt_tokens': 10813, 'num_generated_tokens': 3069, 'num_samples': 126, 'runtime': 29.99269779957831, 'samples/s': 4.201022556956231, 'tokens/s': 102.32490656586249}
+[rank4] {'num_prompt_tokens': 11843, 'num_generated_tokens': 3116, 'num_samples': 126, 'runtime': 29.99307836405933, 'samples/s': 4.200969252658829, 'tokens/s': 103.89063643876912}
+[rank1] {'num_prompt_tokens': 11846, 'num_generated_tokens': 3619, 'num_samples': 127, 'runtime': 29.99246566556394, 'samples/s': 4.234396778715527, 'tokens/s': 120.66363733993302}
+[rank2] {'num_prompt_tokens': 12013, 'num_generated_tokens': 3466, 'num_samples': 127, 'runtime': 29.992073640227318, 'samples/s': 4.23445212636646, 'tokens/s': 115.56386669280431}
+100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.17it/s]
+[rank3] {'num_prompt_tokens': 12099, 'num_generated_tokens': 3510, 'num_samples': 127, 'runtime': 29.99277971126139, 'samples/s': 4.234352441574973, 'tokens/s': 117.02816590494612}
+100%|██████████| 1012/1012 [00:29<00:00, 33.74it/s]
+[rank6] {'num_prompt_tokens': 10584, 'num_generated_tokens': 3364, 'num_samples': 126, 'runtime': 29.99066468887031, 'samples/s': 4.201307350375574, 'tokens/s': 112.16823751320182}
+100%|██████████| 1012/1012 [00:30<00:00, 33.26it/s]100%|██████████| 1012/1012 [00:30<00:00, 33.73it/s]
+[rank0] {'num_prompt_tokens': 12235, 'num_generated_tokens': 3687, 'num_samples': 127, 'runtime': 30.00246545486152, 'samples/s': 4.232985458847391, 'tokens/s': 122.88990068323095}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:09:18.217791
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=de
++ lp=zh2de
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl
+[2025-09-15 18:09:50,940] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:09:57,863] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:09:58,141] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:09:58,166] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:09:58,325] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:09:58,372] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:09:58,390] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:09:58,393] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 18:09:58,400] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.13s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.32s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.31s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:10:02.476432
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 211049.90 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5973.30 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.99it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.98it/s][A
+ 13%|█▎        | 16/125 [00:04<00:31,  3.47it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.37it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.31it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.08it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.06it/s][A
+ 13%|█▎        | 16/125 [00:06<00:45,  2.39it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.50it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.21it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.10it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  4.01it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  3.90it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.77it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.72it/s][A
+ 26%|██▌       | 32/125 [00:10<00:31,  2.91it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.39it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.22it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.92it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.57it/s][A
+ 26%|██▌       | 32/125 [00:13<00:39,  2.35it/s][A
+ 38%|███▊      | 48/125 [00:13<00:22,  3.45it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.71it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.68it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.08it/s][A
+ 38%|███▊      | 48/125 [00:15<00:25,  3.04it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  3.97it/s][A
+ 38%|███▊      | 48/125 [00:16<00:24,  3.11it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.60it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.50it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.25it/s][A
+ 51%|█████     | 64/125 [00:19<00:17,  3.47it/s][A
+ 51%|█████     | 64/125 [00:19<00:19,  3.09it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:11,  3.77it/s][A
+ 51%|█████     | 64/125 [00:20<00:17,  3.44it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:12,  3.57it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:07,  4.09it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:13,  3.31it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.93it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.64it/s][A
+ 64%|██████▍   | 80/125 [00:24<00:14,  3.21it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.69it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:13,  3.41it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.34it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.69it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:03,  4.07it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:08,  3.49it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:07,  3.73it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  4.07it/s][A
+ 77%|███████▋  | 96/125 [00:29<00:07,  3.63it/s][A
+100%|██████████| 125/125 [00:29<00:00,  4.12it/s][A100%|██████████| 125/125 [00:29<00:00,  4.25it/s]
+
+ 77%|███████▋  | 96/125 [00:30<00:09,  3.10it/s][A
+100%|██████████| 125/125 [00:30<00:00,  3.97it/s][A100%|██████████| 125/125 [00:30<00:00,  4.13it/s]
+
+ 90%|████████▉ | 112/125 [00:30<00:03,  4.06it/s][A
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.51it/s][A
+100%|██████████| 125/125 [00:32<00:00,  3.79it/s][A100%|██████████| 125/125 [00:32<00:00,  3.89it/s]
+
+ 90%|████████▉ | 112/125 [00:32<00:03,  3.30it/s][A
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.49it/s][A
+100%|██████████| 125/125 [00:33<00:00,  4.02it/s][A100%|██████████| 125/125 [00:33<00:00,  3.72it/s]
+
+ 90%|████████▉ | 112/125 [00:34<00:03,  3.51it/s][A
+100%|██████████| 125/125 [00:36<00:00,  3.33it/s][A100%|██████████| 125/125 [00:36<00:00,  3.46it/s]
+
+100%|██████████| 125/125 [00:36<00:00,  3.16it/s][A100%|██████████| 125/125 [00:36<00:00,  3.41it/s]
+
+100%|██████████| 125/125 [00:37<00:00,  3.38it/s][A100%|██████████| 125/125 [00:37<00:00,  3.31it/s]
+
+100%|██████████| 125/125 [00:38<00:00,  3.44it/s][A100%|██████████| 125/125 [00:38<00:00,  3.28it/s]
+ 99%|█████████▉| 1000/1012 [00:38<00:00, 26.16it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.16it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.16it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 26.01it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.01it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.99it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.94it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 25.93it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.17it/s][A100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.37it/s][A100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.01it/s][A100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.34s/it][A100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
+
+100%|███████���██| 2/2 [00:01<00:00,  1.04it/s][A100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.06it/s][A100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+
+100%|██████████| 2/2 [00:02<00:00,  1.20s/it][A100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.26s/it][A100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
+100%|██████████| 1012/1012 [00:40<00:00, 24.12it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.12it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.12it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.20it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.17it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.20it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.17it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+
+100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s][rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 5133, 'num_samples': 126, 'runtime': 40.987580340355635, 'samples/s': 3.0741019341398563, 'tokens/s': 125.23305736460223}
+[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 5120, 'num_samples': 126, 'runtime': 40.98803398013115, 'samples/s': 3.074067911163492, 'tokens/s': 124.91450559648476}
+100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 5246, 'num_samples': 127, 'runtime': 40.983664052560925, 'samples/s': 3.098795652753849, 'tokens/s': 128.00222042792672}
+[rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 5256, 'num_samples': 126, 'runtime': 40.98761201091111, 'samples/s': 3.0740995588242166, 'tokens/s': 128.23386731095303}
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 5480, 'num_samples': 127, 'runtime': 40.98785710334778, 'samples/s': 3.0984786464874, 'tokens/s': 133.69813372244846}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 5399, 'num_samples': 126, 'runtime': 40.98828110471368, 'samples/s': 3.0740493771403825, 'tokens/s': 131.72057608873752}
+[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 5192, 'num_samples': 127, 'runtime': 40.98800337128341, 'samples/s': 3.0984675893965945, 'tokens/s': 126.67121042635526}
+100%|██████████| 1012/1012 [00:40<00:00, 24.17it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.69it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 5781, 'num_samples': 127, 'runtime': 40.99730667471886, 'samples/s': 3.0977644704234932, 'tokens/s': 141.00926301982847}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:10:47.066087
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' ru = zh ']'
++ src_lang=ru
++ tgt_lang=zh
++ lp=ru2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/train.log
+[2025-09-15 18:11:19,948] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:11:26,238] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:11:26,971] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:11:27,093] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:11:27,262] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:11:27,300] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:11:27,301] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:11:27,326] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:11:27,328] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.56s/it]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:11:30.566843
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 158452.88 examples/s]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.06s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.26s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.42s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5860.16 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.78it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.68it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.57it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.27it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.26it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.06it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.84it/s][A
+ 13%|█▎        | 16/125 [00:04<00:33,  3.24it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.31it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.53it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.19it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.74it/s][A
+ 38%|███▊      | 48/125 [00:07<00:10,  7.02it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.28it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.18it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.73it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.19it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  4.84it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.21it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.72it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.68it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  5.65it/s][A
+ 51%|█████     | 64/125 [00:11<00:10,  5.84it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.31it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.22it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.14it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.67it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:08,  5.55it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.61it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:08,  5.35it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.34it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.96it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.54it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.30it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.63it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.45it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.47it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:09,  4.54it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.32it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:06,  4.71it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.30it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.78it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.85it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:05,  5.01it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  4.70it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.48it/s][A
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.38it/s][A
+100%|██████████| 125/125 [00:22<00:00,  5.10it/s][A100%|██████████| 125/125 [00:22<00:00,  5.45it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.61it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.69it/s][A100%|██████████| 125/125 [00:23<00:00,  5.21it/s]
+
+100%|██████████| 125/125 [00:24<00:00,  4.60it/s][A100%|██████████| 125/125 [00:24<00:00,  5.04it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.70it/s][A
+100%|██████████| 125/125 [00:25<00:00,  5.22it/s][A100%|██████████| 125/125 [00:25<00:00,  4.93it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:03,  4.05it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.65it/s][A100%|██████████| 125/125 [00:26<00:00,  4.77it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.91it/s][A100%|██████████| 125/125 [00:27<00:00,  4.56it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.40it/s][A100%|██████████| 125/125 [00:27<00:00,  4.51it/s]
+
+ 26%|██▌       | 32/125 [01:38<05:34,  3.60s/it][A
+ 38%|███▊      | 48/125 [01:41<02:36,  2.04s/it][A
+ 51%|█████     | 64/125 [01:44<01:18,  1.29s/it][A
+ 64%|██████▍   | 80/125 [01:47<00:40,  1.11it/s][A
+ 77%|███████▋  | 96/125 [01:50<00:19,  1.51it/s][A
+ 90%|████████▉ | 112/125 [01:52<00:06,  2.02it/s][A
+100%|██████████| 125/125 [01:56<00:00,  2.36it/s][A100%|██████████| 125/125 [01:56<00:00,  1.08it/s]
+ 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s] 99%|█████████▉| 1000/1012 [01:56<00:01,  8.61it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [01:56<00:01,  8.59it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.74it/s][A100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.90it/s][A100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.49it/s][A100%|██████████| 1/1 [00:00<00:00,  1.49it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.40it/s][A100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.14it/s][A100%|██████████| 2/2 [00:00<00:00,  2.14it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.43it/s][A100%|██████████| 2/2 [00:01<00:00,  1.43it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.26it/s][A100%|██████████| 2/2 [00:01<00:00,  1.26it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.53it/s][A100%|██████████| 2/2 [00:01<00:00,  1.53it/s]
+100%|█████████��| 1012/1012 [01:57<00:00,  8.58it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.58it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.58it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.58it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+100%|██████████| 1012/1012 [01:57<00:00,  8.58it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+
+[rank1] {'num_prompt_tokens': 13219, 'num_generated_tokens': 3491, 'num_samples': 127, 'runtime': 117.78320932015777, 'samples/s': 1.078252161178502, 'tokens/s': 29.639199170662604}
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+
+[rank2] {'num_prompt_tokens': 13889, 'num_generated_tokens': 4454, 'num_samples': 127, 'runtime': 117.78289232216775, 'samples/s': 1.078255063159945, 'tokens/s': 37.815338986727525}
+[rank6] {'num_prompt_tokens': 11855, 'num_generated_tokens': 3368, 'num_samples': 126, 'runtime': 117.78370458632708, 'samples/s': 1.0697574884618353, 'tokens/s': 28.594787469360803}
+[rank3] {'num_prompt_tokens': 13378, 'num_generated_tokens': 3479, 'num_samples': 127, 'runtime': 117.78287390060723, 'samples/s': 1.0782552318019576, 'tokens/s': 29.537401192433155}
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s][rank5] {'num_prompt_tokens': 11659, 'num_generated_tokens': 3256, 'num_samples': 126, 'runtime': 117.7827379014343, 'samples/s': 1.0697662683426687, 'tokens/s': 27.64411880733119}
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s][rank4] {'num_prompt_tokens': 12648, 'num_generated_tokens': 3091, 'num_samples': 126, 'runtime': 117.78364958241582, 'samples/s': 1.0697579880290178, 'tokens/s': 26.243031277759474}
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+100%|██████████| 1012/1012 [01:57<00:00,  8.59it/s]
+[rank7] {'num_prompt_tokens': 11527, 'num_generated_tokens': 3201, 'num_samples': 126, 'runtime': 117.78293089941144, 'samples/s': 1.069764515433956, 'tokens/s': 27.177112808762644}
+[rank0] {'num_prompt_tokens': 15095, 'num_generated_tokens': 3567, 'num_samples': 127, 'runtime': 117.79150041751564, 'samples/s': 1.0781762652640008, 'tokens/s': 30.28232077320229}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:13:33.004278
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=ru
++ lp=zh2ru
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/train.log
+[2025-09-15 18:14:05,809] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:14:12,857] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:14:12,947] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:14:12,980] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:14:13,242] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:14:13,252] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:14:13,265] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:14:13,297] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:14:13,298] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.34s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.91s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.05s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:14:17.530414
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 151914.24 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5794.55 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s]
+[A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.99it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.55it/s][A
+ 13%|█▎        | 16/125 [00:04<00:33,  3.30it/s][A
+ 13%|█▎        | 16/125 [00:05<00:34,  3.12it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.09it/s][A
+ 13%|█▎        | 16/125 [00:05<00:36,  2.98it/s][A
+ 13%|█▎        | 16/125 [00:06<00:41,  2.61it/s][A
+ 13%|█▎        | 16/125 [00:06<00:46,  2.32it/s][A
+ 26%|██▌       | 32/125 [00:08<00:22,  4.05it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.82it/s][A
+ 26%|██▌       | 32/125 [00:09<00:26,  3.53it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.32it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.34it/s][A
+ 26%|██▌       | 32/125 [00:10<00:28,  3.31it/s][A
+ 26%|██▌       | 32/125 [00:11<00:35,  2.63it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  4.00it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.91it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.54it/s][A
+ 26%|██▌       | 32/125 [00:14<00:41,  2.26it/s][A
+ 38%|███▊      | 48/125 [00:14<00:22,  3.39it/s][A
+ 38%|███▊      | 48/125 [00:15<00:25,  3.03it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  4.03it/s][A
+ 38%|███▊      | 48/125 [00:16<00:26,  2.90it/s][A
+ 38%|███▊      | 48/125 [00:17<00:29,  2.60it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.73it/s][A
+ 38%|███▊      | 48/125 [00:17<00:26,  2.90it/s][A
+ 51%|█████     | 64/125 [00:17<00:17,  3.47it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.22it/s][A
+ 51%|█████     | 64/125 [00:20<00:18,  3.30it/s][A
+ 51%|█████     | 64/125 [00:20<00:20,  3.03it/s][A
+ 51%|█████     | 64/125 [00:21<00:20,  2.97it/s][A
+ 51%|█████     | 64/125 [00:22<00:22,  2.70it/s][A
+ 51%|█████     | 64/125 [00:22<00:19,  3.06it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:13,  3.36it/s][A
+ 64%|██████���   | 80/125 [00:23<00:14,  3.19it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.88it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:14,  3.11it/s][A
+ 64%|██████▍   | 80/125 [00:26<00:14,  3.09it/s][A
+ 64%|██████▍   | 80/125 [00:27<00:15,  2.85it/s][A
+ 64%|██████▍   | 80/125 [00:27<00:15,  2.86it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:08,  3.41it/s][A
+ 64%|██████▍   | 80/125 [00:28<00:14,  3.01it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:09,  3.22it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.56it/s][A
+ 77%|███████▋  | 96/125 [00:30<00:08,  3.36it/s][A
+ 77%|███████▋  | 96/125 [00:30<00:09,  3.13it/s][A
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.61it/s][A
+ 77%|███████▋  | 96/125 [00:31<00:09,  3.07it/s][A
+ 77%|███████▋  | 96/125 [00:32<00:08,  3.27it/s][A
+ 90%|████████▉ | 112/125 [00:32<00:03,  3.37it/s][A
+ 77%|███████▋  | 96/125 [00:33<00:10,  2.78it/s][A
+100%|██████████| 125/125 [00:33<00:00,  3.44it/s][A100%|██████████| 125/125 [00:33<00:00,  3.69it/s]
+
+ 90%|████████▉ | 112/125 [00:34<00:03,  3.69it/s][A
+100%|██████████| 125/125 [00:36<00:00,  3.37it/s][A100%|██████████| 125/125 [00:36<00:00,  3.46it/s]
+
+ 90%|████████▉ | 112/125 [00:36<00:04,  3.18it/s][A
+ 90%|████████▉ | 112/125 [00:36<00:04,  3.08it/s][A
+ 90%|████████▉ | 112/125 [00:37<00:04,  3.19it/s][A
+ 90%|████████▉ | 112/125 [00:37<00:04,  2.85it/s][A
+100%|██████████| 125/125 [00:37<00:00,  3.73it/s][A100%|██████████| 125/125 [00:37<00:00,  3.34it/s]
+
+100%|██████████| 125/125 [00:37<00:00,  3.20it/s][A100%|██████████| 125/125 [00:37<00:00,  3.34it/s]
+
+100%|██████████| 125/125 [00:41<00:00,  3.07it/s][A100%|██████████| 125/125 [00:41<00:00,  3.02it/s]
+
+100%|██████████| 125/125 [00:41<00:00,  2.87it/s][A100%|██████████| 125/125 [00:41<00:00,  2.99it/s]
+
+100%|██████████| 125/125 [00:42<00:00,  3.01it/s][A100%|██████████| 125/125 [00:42<00:00,  2.95it/s]
+
+100%|██████████| 125/125 [00:42<00:00,  2.83it/s][A100%|██████████| 125/125 [00:42<00:00,  2.94it/s]
+ 99%|█████████▉| 1000/1012 [00:42<00:00, 23.44it/s] 99%|█████████▉| 1000/1012 [00:42<00:00, 23.44it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:42<00:00, 23.42it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:42<00:00, 23.32it/s] 99%|█████████▉| 1000/1012 [00:42<00:00, 23.32it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:43<00:00, 23.25it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:43<00:00, 23.24it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:43<00:00, 23.23it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.19s/it][A100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
+
+100%|██████████| 1/1 [00:00<00:00,  1.02it/s][A100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.09s/it][A100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.35s/it][A100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.04s/it][A100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.08s/it][A100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.22s/it][A100%|██████████| 2/2 [00:02<00:00,  1.22s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.43s/it][A100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
+100%|██████████| 1012/1012 [00:45<00:00, 21.74it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.82it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.79it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.79it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.81it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.81it/s]100%|██████████| 1012/1012 [00:45<00:00, 21.74it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]
+
+
+100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]
+
+100%|██████████| 1012/1012 [00:45<00:00, 22.22it/s]
+
+[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 5738, 'num_samples': 126, 'runtime': 45.54954341612756, 'samples/s': 2.7662187269123675, 'tokens/s': 125.97272265891401}
+[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 5969, 'num_samples': 127, 'runtime': 45.549226896837354, 'samples/s': 2.788192218665693, 'tokens/s': 131.04503427728758}
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 5793, 'num_samples': 126, 'runtime': 45.54951770789921, 'samples/s': 2.7662202881711093, 'tokens/s': 127.18027086805742}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 5888, 'num_samples': 126, 'runtime': 45.54938623122871, 'samples/s': 2.7662282727668077, 'tokens/s': 129.26628627024573}
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 6136, 'num_samples': 127, 'runtime': 45.549255868420005, 'samples/s': 2.788190445237351, 'tokens/s': 134.71131159036526}
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 6121, 'num_samples': 127, 'runtime': 45.54938234016299, 'samples/s': 2.788182703588897, 'tokens/s': 134.38162463517824}[rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 6056, 'num_samples': 126, 'runtime': 45.54765916801989, 'samples/s': 2.7663331618250897, 'tokens/s': 132.9596319683551}
+
+100%|██████████| 1012/1012 [00:45<00:00, 21.74it/s]100%|██████████| 1012/1012 [00:45<00:00, 22.21it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 6529, 'num_samples': 127, 'runtime': 45.5646997038275, 'samples/s': 2.787245407640244, 'tokens/s': 143.29075012978862}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:15:06.953017
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' bn = zh ']'
++ src_lang=bn
++ tgt_lang=zh
++ lp=bn2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/train.log
+[2025-09-15 18:15:39,916] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:15:46,883] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:15:46,978] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:15:47,113] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:15:47,147] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:15:47,209] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:15:47,362] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 18:15:47,440] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:15:47,457] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.50s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:15:51.410866
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 153690.91 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5624.40 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.74it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.50it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.07it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.95it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.69it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.64it/s][A
+ 13%|█▎        | 16/125 [00:05<00:36,  2.97it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.78it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.42it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.27it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  3.97it/s][A
+ 26%|██▌       | 32/125 [00:08<00:26,  3.57it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.14it/s][A
+ 26%|██▌       | 32/125 [00:10<00:30,  3.06it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.60it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.54it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  4.90it/s][A
+ 38%|███▊      | 48/125 [00:12<00:22,  3.50it/s][A
+ 38%|███▊      | 48/125 [00:13<00:20,  3.67it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.61it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.67it/s][A
+ 51%|█████     | 64/125 [00:16<00:16,  3.76it/s][A
+ 51%|█████     | 64/125 [00:16<00:16,  3.68it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.29it/s][A
+ 51%|█████     | 64/125 [00:17<00:17,  3.53it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.45it/s][A
+ 51%|█████     | 64/125 [00:18<00:17,  3.54it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:11,  3.89it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.54it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:11,  3.85it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:12,  3.57it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.37it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.58it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.46it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.77it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.93it/s][A
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.63it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.91it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.71it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.97it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.53it/s][A100%|██████████| 125/125 [00:26<00:00,  4.68it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.99it/s][A100%|██████████| 125/125 [00:27<00:00,  4.49it/s]
+
+ 90%|████████▉ | 112/125 [00:30<00:03,  3.98it/s][A
+ 90%|████████▉ | 112/125 [00:30<00:03,  3.51it/s][A
+100%|██████████| 125/125 [00:30<00:00,  3.57it/s][A100%|██████████| 125/125 [00:30<00:00,  4.06it/s]
+
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.26it/s][A
+100%|██████████| 125/125 [00:34<00:00,  3.85it/s][A100%|██████████| 125/125 [00:34<00:00,  3.68it/s]
+
+100%|██████████| 125/125 [00:34<00:00,  3.49it/s][A100%|██████████| 125/125 [00:34<00:00,  3.59it/s]
+
+100%|██████████| 125/125 [00:35<00:00,  3.18it/s][A100%|██████████| 125/125 [00:35<00:00,  3.52it/s]
+
+ 13%|█▎        | 16/125 [01:43<11:46,  6.48s/it][A
+ 26%|██▌       | 32/125 [01:46<04:18,  2.78s/it][A
+ 38%|███▊      | 48/125 [01:49<02:02,  1.60s/it][A
+ 51%|█████     | 64/125 [01:52<01:03,  1.04s/it][A
+ 26%|██▌       | 32/125 [01:53<06:23,  4.13s/it][A
+ 64%|██████▍   | 80/125 [01:56<00:33,  1.34it/s][A
+ 38%|███▊      | 48/125 [01:58<03:02,  2.37s/it][A
+ 51%|█████     | 64/125 [02:01<01:32,  1.51s/it][A
+ 77%|███████▋  | 96/125 [02:01<00:17,  1.64it/s][A
+ 90%|████████▉ | 112/125 [02:05<00:06,  2.07it/s][A
+ 64%|██████▍   | 80/125 [02:05<00:47,  1.07s/it][A
+100%|██████████| 125/125 [02:08<00:00,  2.37it/s][A100%|██████████| 125/125 [02:08<00:00,  1.03s/it]
+
+ 77%|███████▋  | 96/125 [02:10<00:23,  1.23it/s][A
+ 90%|████████▉ | 112/125 [02:13<00:07,  1.65it/s][A
+100%|██████████| 125/125 [02:16<00:00,  1.96it/s][A100%|██████████| 125/125 [02:16<00:00,  1.10s/it]
+ 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s] 99%|█████████▉| 1000/1012 [02:17<00:01,  7.29it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:17<00:01,  7.28it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  3.18it/s][A100%|██████████| 1/1 [00:00<00:00,  3.17it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.93it/s][A100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.85it/s][A100%|██████████| 1/1 [00:00<00:00,  1.85it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.19it/s][A100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A100%|██████████| 2/2 [00:01<00:00,  1.94it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.42it/s][A100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.39it/s][A100%|██████████| 2/2 [00:01<00:00,  1.39it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.03it/s][A100%|██████████| 2/2 [00:01<00:00,  1.03it/s]
+100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s]100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s]100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s]100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s][rank7] {'num_prompt_tokens': 29724, 'num_generated_tokens': 3358, 'num_samples': 126, 'runtime': 139.05815846845508, 'samples/s': 0.9060957040401388, 'tokens/s': 24.148169636244337}
+100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s]100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s][rank1] {'num_prompt_tokens': 34238, 'num_generated_tokens': 3614, 'num_samples': 127, 'runtime': 139.0578108187765, 'samples/s': 0.9132892230376723, 'tokens/s': 25.989190961087775}
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+100%|██████████| 1012/1012 [02:19<00:00,  7.27it/s][rank6] {'num_prompt_tokens': 29317, 'num_generated_tokens': 4381, 'num_samples': 126, 'runtime': 139.058183318004, 'samples/s': 0.9060955421218037, 'tokens/s': 31.504798174885888}
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s][rank2] {'num_prompt_tokens': 33610, 'num_generated_tokens': 4493, 'num_samples': 127, 'runtime': 139.04952109605074, 'samples/s': 0.9133436706500605, 'tokens/s': 32.31222923016316}
+
+
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s][rank4] {'num_prompt_tokens': 32349, 'num_generated_tokens': 3201, 'num_samples': 126, 'runtime': 139.0582670904696, 'samples/s': 0.9060949962653134, 'tokens/s': 23.019127643216414}
+[rank3] {'num_prompt_tokens': 33748, 'num_generated_tokens': 3509, 'num_samples': 127, 'runtime': 139.05494652315974, 'samples/s': 0.9133080352438093, 'tokens/s': 25.23462909976793}
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+[rank5] {'num_prompt_tokens': 31313, 'num_generated_tokens': 3263, 'num_samples': 126, 'runtime': 139.05831094644964, 'samples/s': 0.9060947105025725, 'tokens/s': 23.464976510872177}
+100%|██████████| 1012/1012 [02:19<00:00,  7.28it/s]
+[rank0] {'num_prompt_tokens': 34392, 'num_generated_tokens': 3766, 'num_samples': 127, 'runtime': 139.06745819561183, 'samples/s': 0.9132258664091079, 'tokens/s': 27.080382778714174}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:18:14.417224
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=bn
++ lp=zh2bn
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/train.log
+[2025-09-15 18:18:48,820] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:18:55,764] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:18:55,776] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:18:55,829] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:18:56,067] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:18:56,237] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:18:56,246] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:18:56,247] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:18:56,253] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.45s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:02<00:02,  2.06s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.56s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.45s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:19:00.657853
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 132624.14 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.50s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.50s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.50s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.51s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.52s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.51s/it]
+
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 9559.80 examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5401.46 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:10<01:14,  1.46it/s][A
+ 13%|█▎        | 16/125 [00:12<01:26,  1.26it/s][A
+ 13%|█▎        | 16/125 [00:13<01:34,  1.15it/s][A
+ 13%|█▎        | 16/125 [00:14<01:39,  1.09it/s][A
+ 26%|██▌       | 32/125 [00:26<01:15,  1.24it/s][A
+ 26%|██▌       | 32/125 [00:29<01:26,  1.07it/s][A
+ 26%|██▌       | 32/125 [00:31<01:35,  1.03s/it][A
+ 38%|███▊      | 48/125 [00:42<01:06,  1.15it/s][A
+ 51%|█████     | 64/125 [00:57<00:54,  1.12it/s][A
+ 13%|█▎        | 16/125 [01:31<10:23,  5.72s/it][A
+ 13%|█▎        | 16/125 [01:34<10:43,  5.91s/it][A
+ 13%|█▎        | 16/125 [01:34<10:44,  5.92s/it][A
+ 13%|█▎        | 16/125 [01:36<10:54,  6.00s/it][A
+ 26%|██▌       | 32/125 [01:43<05:42,  3.68s/it][A
+ 26%|██▌       | 32/125 [01:56<04:59,  3.22s/it][A
+ 38%|███▊      | 48/125 [01:57<03:54,  3.05s/it][A
+ 38%|███▊      | 48/125 [02:02<03:16,  2.56s/it][A
+ 38%|███▊      | 48/125 [02:04<04:07,  3.21s/it][A
+ 51%|█████     | 64/125 [02:16<01:54,  1.87s/it][A
+ 51%|█████     | 64/125 [02:16<02:16,  2.24s/it][A
+ 64%|██████▍   | 80/125 [02:30<02:00,  2.67s/it][A
+ 77%|███████▋  | 96/125 [02:42<00:58,  2.02s/it][A
+ 26%|██▌       | 32/125 [03:00<08:43,  5.63s/it][A
+ 90%|████████▉ | 112/125 [03:02<00:22,  1.77s/it][A
+ 26%|██▌       | 32/125 [03:05<08:57,  5.78s/it][A
+ 26%|██▌       | 32/125 [03:08<09:05,  5.87s/it][A
+ 38%|███▊      | 48/125 [03:12<04:21,  3.39s/it][A
+100%|██████████| 125/125 [03:16<00:00,  1.58s/it][A100%|██████████| 125/125 [03:16<00:00,  1.57s/it]
+
+ 38%|███▊      | 48/125 [03:22<04:39,  3.63s/it][A
+ 51%|█████     | 64/125 [03:27<04:07,  4.05s/it][A
+ 38%|███▊      | 48/125 [03:27<05:35,  4.36s/it][A
+ 51%|█████     | 64/125 [03:28<02:29,  2.44s/it][A
+ 64%|██████▍   | 80/125 [03:40<02:10,  2.89s/it][A
+ 51%|█████     | 64/125 [03:40<02:41,  2.64s/it][A
+ 51%|█████     | 64/125 [03:42<03:03,  3.01s/it][A
+ 64%|██████▍   | 80/125 [03:43<01:25,  1.91s/it][A
+ 64%|██████▍   | 80/125 [03:48<02:37,  3.50s/it][A
+ 64%|██████▍   | 80/125 [03:50<02:29,  3.32s/it][A
+ 77%|███████▋  | 96/125 [03:53<01:03,  2.18s/it][A
+ 64%|██████▍   | 80/125 [03:59<01:43,  2.30s/it][A
+ 77%|███████▋  | 96/125 [04:00<01:10,  2.42s/it][A
+ 90%|████████▉ | 112/125 [04:13<00:24,  1.89s/it][A
+ 38%|███▊      | 48/125 [04:40<07:29,  5.83s/it][A
+ 51%|█████     | 64/125 [04:53<03:54,  3.85s/it][A
+ 64%|██████▍   | 80/125 [05:09<02:07,  2.83s/it][A
+ 64%|██████▍   | 80/125 [05:13<02:49,  3.78s/it][A
+ 77%|███████▋  | 96/125 [05:15<01:33,  3.21s/it][A
+ 77%|███████▋  | 96/125 [05:20<02:03,  4.26s/it][A
+ 90%|████████▉ | 112/125 [05:23<00:43,  3.31s/it][A
+ 90%|████████▉ | 112/125 [05:27<00:31,  2.41s/it][A
+ 77%|███████▋  | 96/125 [05:31<01:40,  3.48s/it][A
+100%|██████████| 125/125 [05:33<00:00,  3.06s/it][A100%|██████████| 125/125 [05:33<00:00,  2.67s/it]
+
+100%|██████████| 125/125 [05:34<00:00,  2.63s/it][A100%|██████████| 125/125 [05:34<00:00,  2.67s/it]
+
+ 90%|████████▉ | 112/125 [05:35<00:41,  3.18s/it][A
+100%|██████████| 125/125 [05:39<00:00,  2.01s/it][A100%|██████████| 125/125 [05:39<00:00,  2.72s/it]
+
+ 90%|████████▉ | 112/125 [05:49<00:35,  2.70s/it][A
+100%|██████████| 125/125 [06:04<00:00,  2.27s/it][A100%|██████████| 125/125 [06:04<00:00,  2.91s/it]
+
+ 77%|███████▋  | 96/125 [06:43<01:51,  3.85s/it][A
+ 77%|███████▋  | 96/125 [06:43<02:08,  4.42s/it][A
+100%|██████████| 125/125 [06:55<00:00,  3.99s/it][A100%|██████████| 125/125 [06:55<00:00,  3.32s/it]
+
+ 90%|████████▉ | 112/125 [06:59<00:38,  2.93s/it][A
+100%|██████████| 125/125 [07:11<00:00,  2.37s/it][A100%|██████████| 125/125 [07:11<00:00,  3.45s/it]
+
+ 90%|████████▉ | 112/125 [08:14<01:02,  4.83s/it][A
+100%|██████████| 125/125 [08:25<00:00,  3.74s/it][A100%|██████████| 125/125 [08:25<00:00,  4.05s/it]
+ 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s] 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [08:26<00:06,  1.98it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:02<00:00,  2.55s/it][A100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.12s/it][A100%|██████████| 1/1 [00:03<00:00,  3.13s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.23s/it][A100%|██████████| 1/1 [00:03<00:00,  3.23s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.36s/it][A100%|██████████| 1/1 [00:04<00:00,  4.36s/it]
+
+100%|██████████| 2/2 [00:04<00:00,  2.37s/it][A100%|██████████| 2/2 [00:04<00:00,  2.37s/it]
+
+100%|██████████| 2/2 [00:05<00:00,  2.86s/it][A100%|██████████| 2/2 [00:05<00:00,  2.86s/it]
+
+100%|██████████| 2/2 [00:07<00:00,  3.92s/it][A100%|██████████| 2/2 [00:07<00:00,  3.92s/it]
+
+100%|██████████| 2/2 [00:08<00:00,  4.38s/it][A100%|██████████| 2/2 [00:08<00:00,  4.38s/it]
+100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██��███████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+
+
+100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+[rank1] {'num_prompt_tokens': 10543, 'num_generated_tokens': 20381, 'num_samples': 127, 'runtime': 514.8834198154509, 'samples/s': 0.24665777749363238, 'tokens/s': 39.5837178196671}
+[rank3] {'num_prompt_tokens': 10346, 'num_generated_tokens': 18409, 'num_samples': 127, 'runtime': 514.8832867275923, 'samples/s': 0.2466578412501307, 'tokens/s': 35.75373385491068}
+[rank2] {'num_prompt_tokens': 10036, 'num_generated_tokens': 20857, 'num_samples': 127, 'runtime': 514.8835607152432, 'samples/s': 0.24665770999481854, 'tokens/s': 40.50818785324354}
+[rank5] {'num_prompt_tokens': 9025, 'num_generated_tokens': 19513, 'num_samples': 126, 'runtime': 514.8818496353924, 'samples/s': 0.24471633655221178, 'tokens/s': 37.898014882089754}
+100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+[rank7] {'num_prompt_tokens': 8723, 'num_generated_tokens': 22950, 'num_samples': 126, 'runtime': 514.8791204933077, 'samples/s': 0.2447176336831816, 'tokens/s': 44.573568992293794}
+[rank4] {'num_prompt_tokens': 8829, 'num_generated_tokens': 24017, 'num_samples': 126, 'runtime': 514.8824920766056, 'samples/s': 0.2447160312090266, 'tokens/s': 46.6455946154539}
+[rank6] {'num_prompt_tokens': 9027, 'num_generated_tokens': 22628, 'num_samples': 126, 'runtime': 514.8836904559284, 'samples/s': 0.24471546163838917, 'tokens/s': 43.94778941232913}
+100%|██████████| 1012/1012 [08:34<00:00,  1.96it/s]100%|██████████| 1012/1012 [08:34<00:00,  1.97it/s]
+[rank0] {'num_prompt_tokens': 10826, 'num_generated_tokens': 24046, 'num_samples': 127, 'runtime': 514.8984199203551, 'samples/s': 0.24665059181895424, 'tokens/s': 46.700473471484834}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:27:39.425154
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' hi = zh ']'
++ src_lang=hi
++ tgt_lang=zh
++ lp=hi2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/train.log
+[2025-09-15 18:28:12,822] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:28:19,870] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:28:19,884] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:28:20,026] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:28:20,123] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:28:20,260] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:28:20,268] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:28:20,305] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:28:20,307] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.42s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:28:24.366781
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 106262.00 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6597.54 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.96it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.91it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.47it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.24it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.19it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.14it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.74it/s][A
+ 13%|█▎        | 16/125 [00:05<00:39,  2.75it/s][A
+ 26%|██▌       | 32/125 [00:06<00:16,  5.50it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.00it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.81it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.54it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.57it/s][A
+ 26%|██▌       | 32/125 [00:07<00:23,  3.98it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.75it/s][A
+ 26%|██▌       | 32/125 [00:08<00:26,  3.56it/s][A
+ 38%|███▊      | 48/125 [00:10<00:15,  4.91it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.46it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.28it/s][A
+ 26%|██▌       | 32/125 [00:11<00:31,  2.93it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.29it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.17it/s][A
+ 38%|███▊      | 48/125 [00:12<00:21,  3.57it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.10it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  5.01it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.57it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.51it/s][A
+ 38%|███▊      | 48/125 [00:14<00:21,  3.61it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.30it/s][A
+ 51%|█████     | 64/125 [00:15<00:15,  3.92it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.64it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.61it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.70it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:11,  4.06it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.21it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.97it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  3.78it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.28it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:11,  3.78it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.61it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.23it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.31it/s][A
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.03it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:07,  4.04it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:14,  3.15it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.95it/s][A
+ 64%|██████▍   | 80/125 [00:24<00:13,  3.38it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.45it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.99it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.34it/s][A
+100%|██████████| 125/125 [00:25<00:00,  4.55it/s][A100%|██████████| 125/125 [00:25<00:00,  4.84it/s]
+
+ 90%|████████▉ | 112/125 [00:26<00:03,  3.97it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:03,  3.87it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  4.28it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:07,  3.68it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  4.33it/s][A
+100%|██████████| 125/125 [00:28<00:00,  4.19it/s][A100%|██████████| 125/125 [00:28<00:00,  4.31it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.41it/s][A100%|██████████| 125/125 [00:29<00:00,  4.19it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.02it/s][A100%|██████████| 125/125 [00:29<00:00,  4.18it/s]
+
+100%|██████████| 125/125 [00:30<00:00,  3.83it/s][A100%|██████████| 125/125 [00:30<00:00,  4.12it/s]
+
+100%|██████████| 125/125 [00:30<00:00,  4.78it/s][A100%|██████████| 125/125 [00:30<00:00,  4.10it/s]
+
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.44it/s][A
+100%|██████████| 125/125 [00:36<00:00,  3.62it/s][A100%|██████████| 125/125 [00:36<00:00,  3.45it/s]
+
+100%|██████████| 125/125 [01:57<00:00,  2.10s/it][A100%|██████████| 125/125 [01:57<00:00,  1.07it/s]
+ 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s] 99%|█████████▉| 1000/1012 [01:57<00:01,  8.51it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s]
+[A
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [01:57<00:01,  8.49it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  3.11it/s][A100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.57it/s][A100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.55it/s][A100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.48it/s][A100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.50it/s][A100%|██████████| 2/2 [00:01<00:00,  1.50it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.47it/s][A100%|██████████| 2/2 [00:01<00:00,  1.47it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.57it/s][A100%|██████████| 2/2 [00:01<00:00,  1.57it/s]
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s][rank4] {'num_prompt_tokens': 28977, 'num_generated_tokens': 3196, 'num_samples': 126, 'runtime': 119.04465030506253, 'samples/s': 1.058426394442033, 'tokens/s': 26.847069497116966}
+[rank1] {'num_prompt_tokens': 28921, 'num_generated_tokens': 4568, 'num_samples': 127, 'runtime': 119.04437659680843, 'samples/s': 1.0668290567822156, 'tokens/s': 38.37224512898552}
+[rank2] {'num_prompt_tokens': 28690, 'num_generated_tokens': 3371, 'num_samples': 127, 'runtime': 119.04456693865359, 'samples/s': 1.06682735101591, 'tokens/s': 28.317125986414432}
+
+[rank3] {'num_prompt_tokens': 28563, 'num_generated_tokens': 3444, 'num_samples': 127, 'runtime': 119.04122011177242, 'samples/s': 1.0668573447143332, 'tokens/s': 28.931155080284753}
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+[rank7] {'num_prompt_tokens': 27316, 'num_generated_tokens': 3139, 'num_samples': 126, 'runtime': 119.04476114362478, 'samples/s': 1.0584254089769132, 'tokens/s': 26.368233006178812}
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+[rank6] {'num_prompt_tokens': 26138, 'num_generated_tokens': 3269, 'num_samples': 126, 'runtime': 119.04463176988065, 'samples/s': 1.0584265592384243, 'tokens/s': 27.46028906468579}
+[rank5] {'num_prompt_tokens': 29156, 'num_generated_tokens': 3307, 'num_samples': 126, 'runtime': 119.0449253115803, 'samples/s': 1.0584239493637881, 'tokens/s': 27.779428575762278}
+100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]100%|██████████| 1012/1012 [01:59<00:00,  8.50it/s]
+[rank0] {'num_prompt_tokens': 29088, 'num_generated_tokens': 3809, 'num_samples': 127, 'runtime': 119.05356797762215, 'samples/s': 1.066746693588146, 'tokens/s': 31.994001227379904}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:30:27.215550
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=hi
++ lp=zh2hi
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/train.log
+[2025-09-15 18:31:01,841] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:31:08,850] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:31:08,904] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 18:31:08,948] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:31:09,082] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:31:09,127] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:31:09,267] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:31:09,286] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:31:09,290] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.77s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:31:13.449405
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 84669.19 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6534.08 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]
+[A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:12<01:22,  1.32it/s][A
+ 13%|█▎        | 16/125 [00:12<01:26,  1.27it/s][A
+ 13%|█▎        | 16/125 [00:14<01:37,  1.12it/s][A
+ 13%|█▎        | 16/125 [00:18<02:05,  1.15s/it][A
+ 26%|██▌       | 32/125 [00:21<01:00,  1.54it/s][A
+ 26%|██▌       | 32/125 [00:24<01:11,  1.29it/s][A
+ 26%|██▌       | 32/125 [00:25<01:13,  1.27it/s][A
+ 26%|██▌       | 32/125 [00:29<01:23,  1.11it/s][A
+ 38%|███▊      | 48/125 [00:31<00:49,  1.54it/s][A
+ 38%|███▊      | 48/125 [00:36<00:56,  1.36it/s][A
+ 38%|███▊      | 48/125 [00:37<01:00,  1.27it/s][A
+ 38%|███▊      | 48/125 [00:44<01:08,  1.12it/s][A
+ 51%|█████     | 64/125 [00:48<00:49,  1.24it/s][A
+ 51%|█████     | 64/125 [00:49<00:47,  1.29it/s][A
+ 51%|█████     | 64/125 [00:51<00:49,  1.22it/s][A
+ 64%|██████▍   | 80/125 [01:04<00:36,  1.23it/s][A
+ 64%|██████▍   | 80/125 [01:07<00:42,  1.05it/s][A
+ 77%|███████▋  | 96/125 [01:14<00:21,  1.33it/s][A
+ 77%|███████▋  | 96/125 [01:20<00:25,  1.12it/s][A
+ 90%|████████▉ | 112/125 [01:29<00:10,  1.25it/s][A
+ 90%|████████▉ | 112/125 [01:32<00:11,  1.14it/s][A
+ 13%|█▎        | 16/125 [01:33<10:36,  5.84s/it][A
+ 13%|█▎        | 16/125 [01:34<10:40,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:34<10:41,  5.89s/it][A
+ 13%|█▎        | 16/125 [01:36<10:56,  6.02s/it][A
+100%|██████████| 125/125 [01:40<00:00,  1.25it/s][A100%|██████████| 125/125 [01:40<00:00,  1.25it/s]
+
+100%|██████████| 125/125 [01:44<00:00,  1.14it/s][A100%|██████████| 125/125 [01:44<00:00,  1.20it/s]
+
+ 26%|██▌       | 32/125 [01:46<04:27,  2.88s/it][A
+ 26%|██▌       | 32/125 [01:48<04:34,  2.95s/it][A
+ 38%|███▊      | 48/125 [01:58<02:26,  1.90s/it][A
+ 38%|███▊      | 48/125 [01:58<02:27,  1.92s/it][A
+ 51%|█████     | 64/125 [02:09<01:27,  1.43s/it][A
+ 51%|█████     | 64/125 [02:09<01:27,  1.43s/it][A
+ 51%|█████     | 64/125 [02:17<02:53,  2.85s/it][A
+ 64%|██████▍   | 80/125 [02:22<01:56,  2.58s/it][A
+ 64%|██████▍   | 80/125 [02:22<00:53,  1.20s/it][A
+ 64%|██████▍   | 80/125 [02:28<01:33,  2.07s/it][A
+ 77%|███████▋  | 96/125 [02:38<00:45,  1.57s/it][A
+ 77%|███████▋  | 96/125 [02:38<00:32,  1.13s/it][A
+ 90%|████████▉ | 112/125 [02:48<00:16,  1.27s/it][A
+ 26%|██▌       | 32/125 [03:06<09:02,  5.83s/it][A
+ 26%|██▌       | 32/125 [03:11<09:14,  5.97s/it][A
+ 38%|███▊      | 48/125 [03:23<04:39,  3.63s/it][A
+ 51%|█████     | 64/125 [03:34<02:30,  2.47s/it][A
+ 64%|██████▍   | 80/125 [03:43<02:16,  3.02s/it][A
+ 77%|███████▋  | 96/125 [03:53<01:45,  3.64s/it][A
+ 64%|██████▍   | 80/125 [03:53<01:30,  2.02s/it][A
+ 90%|████████▉ | 112/125 [04:04<00:34,  2.67s/it][A
+ 77%|███████▋  | 96/125 [04:06<00:46,  1.62s/it][A
+100%|██████████| 125/125 [04:06<00:00,  2.57s/it][A100%|██████████| 125/125 [04:06<00:00,  1.98s/it]
+
+ 90%|████████▉ | 112/125 [04:10<00:34,  2.64s/it][A
+ 90%|████████▉ | 112/125 [04:18<00:17,  1.33s/it][A
+100%|██████████| 125/125 [04:25<00:00,  2.23s/it][A100%|██████████| 125/125 [04:25<00:00,  2.12s/it]
+
+100%|██████████| 125/125 [04:33<00:00,  1.28s/it][A100%|██████████| 125/125 [04:33<00:00,  2.19s/it]
+
+ 38%|███▊      | 48/125 [04:43<07:32,  5.87s/it][A
+ 51%|█████     | 64/125 [04:56<03:57,  3.89s/it][A
+ 64%|██████▍   | 80/125 [05:12<02:07,  2.83s/it][A
+ 77%|███████▋  | 96/125 [05:17<01:55,  3.98s/it][A
+100%|██████████| 125/125 [05:22<00:00,  3.60s/it][A100%|██████████| 125/125 [05:22<00:00,  2.58s/it]
+
+ 77%|███████▋  | 96/125 [05:24<01:01,  2.13s/it][A
+ 90%|████████▉ | 112/125 [05:30<00:38,  2.96s/it][A
+ 90%|████████▉ | 112/125 [05:37<00:22,  1.70s/it][A
+100%|██████████| 125/125 [05:43<00:00,  2.41s/it][A100%|██████████| 125/125 [05:43<00:00,  2.74s/it]
+
+100%|██████████| 125/125 [05:48<00:00,  1.47s/it][A100%|██████████| 125/125 [05:48<00:00,  2.79s/it]
+ 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s] 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s]
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [05:48<00:04,  2.87it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:02<00:00,  2.22s/it][A100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.29s/it][A100%|██████████| 1/1 [00:03<00:00,  3.30s/it]
+
+100%|█████���████| 1/1 [00:03<00:00,  3.37s/it][A100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.10s/it][A100%|██████████| 1/1 [00:04<00:00,  4.10s/it]
+
+100%|██████████| 2/2 [00:05<00:00,  2.87s/it][A100%|██████████| 2/2 [00:05<00:00,  2.87s/it]
+
+100%|██████████| 2/2 [00:07<00:00,  3.95s/it][A100%|██████████| 2/2 [00:07<00:00,  3.95s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.10s/it][A100%|██████████| 2/2 [00:44<00:00, 22.10s/it]
+
+100%|██████████| 2/2 [00:46<00:00, 23.09s/it][A100%|██████████| 2/2 [00:46<00:00, 23.09s/it]
+100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]
+[rank3] {'num_prompt_tokens': 10092, 'num_generated_tokens': 17173, 'num_samples': 127, 'runtime': 395.0415842477232, 'samples/s': 0.3214851424865709, 'tokens/s': 43.4713728497786}
+100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+[rank4] {'num_prompt_tokens': 8577, 'num_generated_tokens': 17780, 'num_samples': 126, 'runtime': 395.04179600812495, 'samples/s': 0.318953592438124, 'tokens/s': 45.007895821824164}
+100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]
+[rank1] {'num_prompt_tokens': 10289, 'num_generated_tokens': 17324, 'num_samples': 127, 'runtime': 395.0414089541882, 'samples/s': 0.3214852851406466, 'tokens/s': 43.85363054942175}
+[rank5] {'num_prompt_tokens': 8773, 'num_generated_tokens': 15395, 'num_samples': 126, 'runtime': 395.04182332009077, 'samples/s': 0.31895357038666233, 'tokens/s': 38.97055727065609}
+100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+100%|██████████| 1012/1012 [06:35<00:00,  2.45it/s]100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+[rank7] {'num_prompt_tokens': 8471, 'num_generated_tokens': 18494, 'num_samples': 126, 'runtime': 395.0418896060437, 'samples/s': 0.3189535168679295, 'tokens/s': 46.815288420281654}
+100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+[rank2] {'num_prompt_tokens': 9782, 'num_generated_tokens': 17309, 'num_samples': 127, 'runtime': 395.0417209174484, 'samples/s': 0.3214850312646828, 'tokens/s': 43.81562524535744}
+[rank0] {'num_prompt_tokens': 10572, 'num_generated_tokens': 20730, 'num_samples': 127, 'runtime': 395.0563453454524, 'samples/s': 0.32147313034282826, 'tokens/s': 52.473527496116766}
+100%|██████████| 1012/1012 [06:35<00:00,  2.56it/s]
+[rank6] {'num_prompt_tokens': 8775, 'num_generated_tokens': 19962, 'num_samples': 126, 'runtime': 395.0421266518533, 'samples/s': 0.3189533254792407, 'tokens/s': 50.531319708068274}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:37:52.269855
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' th = zh ']'
++ src_lang=th
++ tgt_lang=zh
++ lp=th2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/train.log
+[2025-09-15 18:38:25,782] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:38:32,898] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:38:33,007] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:38:33,047] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:38:33,060] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:38:33,102] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:38:33,119] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:38:33,134] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:38:33,244] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.84s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.52s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.52s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:38:37.598505
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 161817.53 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5749.52 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.38it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.36it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.09it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.68it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.35it/s][A
+ 13%|█▎        | 16/125 [00:03<00:27,  4.01it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.73it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.25it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.77it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.19it/s][A
+ 13%|█▎        | 16/125 [00:06<00:44,  2.48it/s][A
+ 26%|██▌       | 32/125 [00:06<00:20,  4.56it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.39it/s][A
+ 38%|███▊      | 48/125 [00:08<00:12,  6.18it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.72it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.63it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.61it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.22it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.39it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.76it/s][A
+ 51%|█████     | 64/125 [00:11<00:10,  5.87it/s][A
+ 51%|█████     | 64/125 [00:11<00:11,  5.28it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.26it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.20it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.53it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.86it/s][A
+ 38%|███▊      | 48/125 [00:14<00:21,  3.52it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:08,  5.28it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.10it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.30it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  4.03it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.81it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.33it/s][A
+ 51%|█████     | 64/125 [00:17<00:15,  4.00it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  4.95it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.03it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.19it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.25it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.93it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.16it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.44it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.08it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.44it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:11,  3.79it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.34it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.38it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.97it/s][A100%|██████████| 125/125 [00:23<00:00,  5.33it/s]
+
+100%|██████████| 125/125 [00:23<00:00,  5.15it/s][A100%|██████████| 125/125 [00:23<00:00,  5.28it/s]
+
+100%|██████████| 125/125 [00:24<00:00,  4.90it/s][A100%|██████████| 125/125 [00:24<00:00,  5.16it/s]
+
+ 90%|████████▉ | 112/125 [00:24<00:03,  4.23it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  4.09it/s][A
+100%|██████████| 125/125 [00:25<00:00,  4.69it/s][A100%|██████████| 125/125 [00:25<00:00,  4.87it/s]
+
+ 90%|████████▉ | 112/125 [00:27<00:03,  3.99it/s][A
+100%|██████████| 125/125 [00:27<00:00,  4.19it/s][A100%|██████████| 125/125 [00:27<00:00,  4.48it/s]
+
+ 90%|████████▉ | 112/125 [00:28<00:02,  4.56it/s][A
+100%|██████████| 125/125 [00:30<00:00,  4.16it/s][A100%|██████████| 125/125 [00:30<00:00,  4.10it/s]
+
+100%|██████████| 125/125 [00:31<00:00,  4.43it/s][A100%|██████████| 125/125 [00:31<00:00,  4.00it/s]
+
+ 26%|██▌       | 32/125 [01:43<05:48,  3.75s/it][A
+ 38%|███▊      | 48/125 [01:46<02:43,  2.12s/it][A
+ 51%|█████     | 64/125 [01:48<01:22,  1.35s/it][A
+ 64%|██████▍   | 80/125 [01:52<00:42,  1.06it/s][A
+ 77%|███████▋  | 96/125 [01:55<00:20,  1.44it/s][A
+ 90%|████████▉ | 112/125 [01:58<00:06,  1.91it/s][A
+100%|██████████| 125/125 [02:01<00:00,  2.32it/s][A100%|██████████| 125/125 [02:01<00:00,  1.03it/s]
+ 99%|█████████▉| 1000/1012 [02:01<00:01,  8.25it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.25it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.25it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.25it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.25it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.26it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:01<00:01,  8.24it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.32it/s][A100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.06it/s][A100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.05it/s][A100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.53it/s][A100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A100%|██████████| 2/2 [00:01<00:00,  1.75it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.42it/s][A100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.45it/s][A100%|██████████| 2/2 [00:01<00:00,  1.45it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.28it/s][A100%|██████████| 2/2 [00:01<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.23it/s]
+[rank7] {'num_prompt_tokens': 16654, 'num_generated_tokens': 3162, 'num_samples': 126, 'runtime': 123.04600406065583, 'samples/s': 1.0240072480361735, 'tokens/s': 25.697705700717304}
+
+[rank4] {'num_prompt_tokens': 17731, 'num_generated_tokens': 3136, 'num_samples': 126, 'runtime': 123.04573278874159, 'samples/s': 1.0240095056065912, 'tokens/s': 25.486458806208493}
+[rank1] {'num_prompt_tokens': 19339, 'num_generated_tokens': 3682, 'num_samples': 127, 'runtime': 123.04556661657989, 'samples/s': 1.0321379590679804, 'tokens/s': 29.923873742427592}
+[rank6] {'num_prompt_tokens': 16331, 'num_generated_tokens': 3381, 'num_samples': 126, 'runtime': 123.04580506123602, 'samples/s': 1.024008904141785, 'tokens/s': 27.477572261137897}
+[rank2] {'num_prompt_tokens': 17613, 'num_generated_tokens': 4386, 'num_samples': 127, 'runtime': 123.04173413850367, 'samples/s': 1.0321701078842132, 'tokens/s': 35.646441678583926}
+100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s][rank3] {'num_prompt_tokens': 18962, 'num_generated_tokens': 3408, 'num_samples': 127, 'runtime': 123.03586869873106, 'samples/s': 1.0322193141170533, 'tokens/s': 27.69923954733006}
+100%|██████████| 1012/1012 [02:03<00:00,  8.21it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+[rank0] {'num_prompt_tokens': 19742, 'num_generated_tokens': 3768, 'num_samples': 127, 'runtime': 123.05505439080298, 'samples/s': 1.0320583793061315, 'tokens/s': 30.620440734059088}
+100%|██████████| 1012/1012 [02:03<00:00,  8.22it/s]
+[rank5] {'num_prompt_tokens': 16286, 'num_generated_tokens': 3327, 'num_samples': 126, 'runtime': 123.04614425823092, 'samples/s': 1.0240060812923155, 'tokens/s': 27.038636765551857}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:40:44.631187
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=th
++ lp=zh2th
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl
+[2025-09-15 18:41:17,535] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:41:24,450] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:41:24,601] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:41:24,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:41:24,769] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:41:24,969] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:41:25,026] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:41:25,032] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:41:25,048] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:41:28.994051
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 101865.55 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6157.14 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:05<00:37,  2.89it/s][A
+ 13%|█▎        | 16/125 [00:05<00:40,  2.69it/s][A
+ 13%|█▎        | 16/125 [00:06<00:41,  2.60it/s][A
+ 13%|█▎        | 16/125 [00:06<00:44,  2.46it/s][A
+ 13%|█▎        | 16/125 [00:07<00:51,  2.12it/s][A
+ 13%|█▎        | 16/125 [00:08<00:55,  1.96it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.35it/s][A
+ 13%|█▎        | 16/125 [00:10<01:09,  1.56it/s][A
+ 26%|██▌       | 32/125 [00:11<00:33,  2.74it/s][A
+ 26%|██▌       | 32/125 [00:11<00:34,  2.68it/s][A
+ 26%|██▌       | 32/125 [00:14<00:39,  2.33it/s][A
+ 26%|██▌       | 32/125 [00:15<00:45,  2.06it/s][A
+ 38%|███▊      | 48/125 [00:15<00:25,  3.04it/s][A
+ 26%|██▌       | 32/125 [00:16<00:47,  1.97it/s][A
+ 38%|███▊      | 48/125 [00:16<00:25,  3.03it/s][A
+ 38%|███▊      | 48/125 [00:20<00:34,  2.25it/s][A
+ 38%|███▊      | 48/125 [00:21<00:33,  2.32it/s][A
+ 51%|█████     | 64/125 [00:21<00:19,  3.12it/s][A
+ 26%|██▌       | 32/125 [00:21<01:03,  1.46it/s][A
+ 51%|█████     | 64/125 [00:22<00:22,  2.73it/s][A
+ 38%|███▊      | 48/125 [00:22<00:37,  2.08it/s][A
+ 38%|███▊      | 48/125 [00:23<00:37,  2.05it/s][A
+ 64%|██████▍   | 80/125 [00:27<00:15,  2.93it/s][A
+ 51%|█████     | 64/125 [00:28<00:28,  2.14it/s][A
+ 51%|█████     | 64/125 [00:28<00:26,  2.29it/s][A
+ 51%|█████     | 64/125 [00:30<00:30,  1.99it/s][A
+ 51%|█████     | 64/125 [00:31<00:29,  2.04it/s][A
+ 64%|██████▍   | 80/125 [00:31<00:19,  2.29it/s][A
+ 77%|███████▋  | 96/125 [00:35<00:11,  2.57it/s][A
+ 64%|██████▍   | 80/125 [00:36<00:21,  2.08it/s][A
+ 64%|██████▍   | 80/125 [00:36<00:20,  2.20it/s][A
+ 77%|███████▋  | 96/125 [00:38<00:12,  2.29it/s][A
+ 64%|██████▍   | 80/125 [00:38<00:22,  1.98it/s][A
+ 64%|██████▍   | 80/125 [00:39<00:21,  2.05it/s][A
+ 90%|████████▉ | 112/125 [00:41<00:05,  2.48it/s][A
+ 77%|███████▋  | 96/125 [00:42<00:13,  2.20it/s][A
+ 90%|████████▉ | 112/125 [00:44<00:05,  2.47it/s][A
+ 77%|███████▋  | 96/125 [00:44<00:12,  2.25it/s][A
+ 77%|███████▋  | 96/125 [00:44<00:13,  2.10it/s][A
+100%|██████████| 125/125 [00:49<00:00,  2.41it/s][A100%|██████████| 125/125 [00:49<00:00,  2.51it/s]
+
+100%|██████████| 125/125 [00:49<00:00,  2.18it/s][A100%|██████████| 125/125 [00:49<00:00,  2.51it/s]
+
+ 90%|████████▉ | 112/125 [00:49<00:05,  2.40it/s][A
+ 77%|███████▋  | 96/125 [00:51<00:16,  1.72it/s][A
+ 90%|████████▉ | 112/125 [00:54<00:06,  1.93it/s][A
+100%|██████████| 125/125 [00:54<00:00,  2.47it/s][A100%|██████████| 125/125 [00:54<00:00,  2.28it/s]
+
+ 90%|████████▉ | 112/125 [00:56<00:06,  2.00it/s][A
+100%|██████████| 125/125 [01:01<00:00,  1.92it/s][A100%|██████████| 125/125 [01:01<00:00,  2.04it/s]
+
+100%|██████████| 125/125 [01:03<00:00,  1.98it/s][A100%|██████████| 125/125 [01:03<00:00,  1.97it/s]
+
+ 13%|█▎        | 16/125 [01:30<10:13,  5.63s/it][A
+ 26%|██▌       | 32/125 [01:36<03:57,  2.55s/it][A
+ 38%|███▊      | 48/125 [01:41<01:58,  1.54s/it][A
+ 51%|█████     | 64/125 [01:48<01:06,  1.09s/it][A
+ 38%|███▊      | 48/125 [01:52<03:48,  2.96s/it][A
+ 64%|██████▍   | 80/125 [01:55<00:38,  1.16it/s][A
+ 51%|█████     | 64/125 [02:00<02:01,  1.99s/it][A
+ 77%|███████▋  | 96/125 [02:03<00:21,  1.37it/s][A
+ 64%|██████▍   | 80/125 [02:07<01:04,  1.42s/it][A
+ 90%|████████▉ | 112/125 [02:09<00:08,  1.62it/s][A
+100%|██████████| 125/125 [02:15<00:00,  1.75it/s][A100%|██████████| 125/125 [02:15<00:00,  1.08s/it]
+
+ 90%|████████▉ | 112/125 [02:17<00:29,  2.23s/it][A
+100%|██████████| 125/125 [02:26<00:00,  1.81s/it][A100%|██████████| 125/125 [02:26<00:00,  1.17s/it]
+
+ 77%|███████▋  | 96/125 [03:36<01:22,  2.84s/it][A
+ 90%|████████▉ | 112/125 [03:45<00:27,  2.10s/it][A
+100%|██████████| 125/125 [03:50<00:00,  1.63s/it][A100%|██████████| 125/125 [03:50<00:00,  1.85s/it]
+ 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s] 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+
+  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s][A
+[A  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [03:50<00:02,  4.33it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.21s/it][A100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.41s/it][A100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.59s/it][A100%|██████████| 1/1 [00:01<00:00,  1.59s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.74s/it][A100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.23s/it][A100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.65s/it][A100%|██████████| 2/2 [00:03<00:00,  1.65s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.73s/it][A100%|██████████| 2/2 [00:03<00:00,  1.73s/it]
+
+100%|██████████| 2/2 [00:04<00:00,  2.34s/it][A100%|██████████| 2/2 [00:04<00:00,  2.34s/it]
+100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]
+
+
+100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]
+100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]
+[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 9450, 'num_samples': 126, 'runtime': 235.6191403158009, 'samples/s': 0.5347613094213055, 'tokens/s': 40.10709820659791}
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 8024, 'num_samples': 126, 'runtime': 235.6194819305092, 'samples/s': 0.5347605340935302, 'tokens/s': 34.05490893306735}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 8551, 'num_samples': 126, 'runtime': 235.62011764012277, 'samples/s': 0.5347590912947748, 'tokens/s': 36.291468171917614}
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 8992, 'num_samples': 127, 'runtime': 235.6199065539986, 'samples/s': 0.5390036939467785, 'tokens/s': 38.16315918086167}
+100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 10119, 'num_samples': 127, 'runtime': 235.61854223534465, 'samples/s': 0.5390068149778621, 'tokens/s': 42.946535124102255}
+100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s]100%|██████████| 1012/1012 [03:55<00:00,  4.28it/s][rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 8844, 'num_samples': 126, 'runtime': 235.62037513963878, 'samples/s': 0.5347585068792415, 'tokens/s': 37.534954244762005}
+100%|██████████| 1012/1012 [03:55<00:00,  4.29it/s]
+100%|██████████| 1012/1012 [03:55<00:00,  4.30it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 11263, 'num_samples': 127, 'runtime': 235.62899621203542, 'samples/s': 0.5389829012627824, 'tokens/s': 47.79971981828911}
+[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 8653, 'num_samples': 127, 'runtime': 235.62029391899705, 'samples/s': 0.5390028078127295, 'tokens/s': 36.72434091341376}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:45:28.462337
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' jv = zh ']'
++ src_lang=jv
++ tgt_lang=zh
++ lp=jv2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/train.log
+[2025-09-15 18:46:03,033] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:46:09,809] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:46:10,187] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:46:10,249] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:46:10,378] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:46:10,413] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:46:10,422] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:46:10,464] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:46:10,482] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] attn_impl: flash_attn
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.97s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:46:14.557672
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 221941.73 examples/s]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5886.55 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.85it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.44it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.32it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.19it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.74it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.37it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.24it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.91it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  7.06it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.33it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.06it/s][A
+ 26%|██▌       | 32/125 [00:06<00:16,  5.49it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.69it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.46it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.06it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.75it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.90it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.64it/s][A
+ 38%|███▊      | 48/125 [00:09<00:17,  4.52it/s][A
+ 38%|███▊      | 48/125 [00:10<00:15,  4.84it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.51it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.11it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  5.81it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.31it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.24it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.03it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.81it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.52it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.53it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:08,  5.11it/s][A
+ 51%|█████     | 64/125 [00:15<00:13,  4.43it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:09,  4.95it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.76it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.33it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.60it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:04,  5.82it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.03it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.29it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  6.08it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.32it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  6.49it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.84it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.27it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.57it/s][A
+100%|██████████| 125/125 [00:21<00:00,  6.55it/s][A100%|██████████| 125/125 [00:21<00:00,  5.85it/s]
+
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.74it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.29it/s][A100%|██████████| 125/125 [00:21<00:00,  5.69it/s]
+
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.86it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.71it/s][A
+100%|██████████| 125/125 [00:23<00:00,  4.89it/s][A100%|██████████| 125/125 [00:23<00:00,  5.25it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.56it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.44it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:03,  3.96it/s][A
+100%|██████████| 125/125 [00:27<00:00,  4.23it/s][A100%|██████████| 125/125 [00:27<00:00,  4.61it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.56it/s][A100%|██████████| 125/125 [00:28<00:00,  4.42it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.39it/s][A100%|██████████| 125/125 [00:28<00:00,  4.35it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.03it/s][A100%|██████████| 125/125 [00:29<00:00,  4.29it/s]
+
+ 26%|██▌       | 32/125 [01:39<05:35,  3.60s/it][A
+ 38%|███▊      | 48/125 [01:42<02:37,  2.05s/it][A
+ 51%|█████     | 64/125 [01:44<01:19,  1.30s/it][A
+ 64%|██████▍   | 80/125 [01:47<00:40,  1.10it/s][A
+ 77%|███████▋  | 96/125 [01:51<00:19,  1.49it/s][A
+ 90%|████████▉ | 112/125 [01:53<00:06,  2.00it/s][A
+100%|██████████| 125/125 [03:16<00:00,  2.12s/it][A100%|██████████| 125/125 [03:16<00:00,  1.58s/it]
+ 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s] 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [03:17<00:02,  5.07it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.13it/s][A100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.01it/s][A100%|██████████| 1/1 [00:00<00:00,  2.01it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.64it/s][A100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.02s/it][A100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
+
+100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A100%|██████████| 2/2 [00:01<00:00,  1.75it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.46it/s][A100%|██████████| 2/2 [00:01<00:00,  1.46it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.36it/s][A100%|██████████| 2/2 [00:01<00:00,  1.36it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.40it/s][A100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
+100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]
+100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]
+100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]
+100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]
+
+100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s][rank1] {'num_prompt_tokens': 12953, 'num_generated_tokens': 3590, 'num_samples': 127, 'runtime': 198.72497743740678, 'samples/s': 0.639074169929151, 'tokens/s': 18.065167480674425}
+[rank6] {'num_prompt_tokens': 11871, 'num_generated_tokens': 3406, 'num_samples': 126, 'runtime': 198.72844148986042, 'samples/s': 0.634031037809094, 'tokens/s': 17.138965990299795}
+[rank5] {'num_prompt_tokens': 12413, 'num_generated_tokens': 3483, 'num_samples': 126, 'runtime': 198.72830202803016, 'samples/s': 0.634031482753916, 'tokens/s': 17.52644170184039}
+100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s][rank2] {'num_prompt_tokens': 13763, 'num_generated_tokens': 5468, 'num_samples': 127, 'runtime': 198.72835976071656, 'samples/s': 0.6390632929941015, 'tokens/s': 27.514945559777534}
+[rank7] {'num_prompt_tokens': 12150, 'num_generated_tokens': 3223, 'num_samples': 126, 'runtime': 198.72915860265493, 'samples/s': 0.6340287499124786, 'tokens/s': 16.21805286482475}
+
+100%|██████████| 1012/1012 [03:18<00:00,  5.10it/s]100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]
+[rank4] {'num_prompt_tokens': 13171, 'num_generated_tokens': 3172, 'num_samples': 126, 'runtime': 198.72913489677012, 'samples/s': 0.6340288255441293, 'tokens/s': 15.96142408433316}
+100%|██████████| 1012/1012 [03:18<00:00,  5.09it/s]
+[rank3] {'num_prompt_tokens': 13300, 'num_generated_tokens': 3516, 'num_samples': 127, 'runtime': 198.72921881265938, 'samples/s': 0.639060530498648, 'tokens/s': 17.692415946718477}
+[rank0] {'num_prompt_tokens': 14139, 'num_generated_tokens': 3839, 'num_samples': 127, 'runtime': 198.7427084352821, 'samples/s': 0.6390171543896205, 'tokens/s': 19.316431934659473}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:49:37.740456
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=jv
++ lp=zh2jv
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/train.log
+[2025-09-15 18:50:10,619] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:50:17,513] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:50:17,626] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:50:17,690] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:50:17,815] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:50:17,931] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:50:18,023] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:50:18,054] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:50:18,063] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.30s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.38s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|��█████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:50:21.932410
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 132761.03 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6364.72 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s]
+[A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A
+ 13%|█▎        | 16/125 [00:04<00:31,  3.41it/s][A
+ 13%|█▎        | 16/125 [00:05<00:34,  3.18it/s][A
+ 13%|█▎        | 16/125 [00:05<00:36,  2.96it/s][A
+ 13%|█▎        | 16/125 [00:05<00:39,  2.77it/s][A
+ 26%|██▌       | 32/125 [00:08<00:22,  4.10it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  3.93it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.37it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.55it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.62it/s][A
+ 38%|███▊      | 48/125 [00:14<00:24,  3.16it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.24it/s][A
+ 13%|█▎        | 16/125 [01:32<10:27,  5.76s/it][A
+ 13%|█▎        | 16/125 [01:32<10:33,  5.81s/it][A
+ 13%|█▎        | 16/125 [01:34<10:44,  5.91s/it][A
+ 13%|█▎        | 16/125 [01:36<10:54,  6.01s/it][A
+ 26%|██▌       | 32/125 [01:36<03:55,  2.53s/it][A
+ 26%|██▌       | 32/125 [01:36<03:56,  2.54s/it][A
+ 38%|███▊      | 48/125 [01:41<01:56,  1.51s/it][A
+ 26%|██▌       | 32/125 [01:42<05:44,  3.70s/it][A
+ 26%|██▌       | 32/125 [01:43<04:15,  2.75s/it][A
+ 51%|█████     | 64/125 [01:45<02:27,  2.42s/it][A
+ 38%|███▊      | 48/125 [01:46<02:44,  2.13s/it][A
+ 38%|███▊      | 48/125 [01:47<02:02,  1.60s/it][A
+ 51%|█████     | 64/125 [01:47<02:31,  2.48s/it][A
+ 51%|█████     | 64/125 [01:51<01:26,  1.41s/it][A
+ 64%|██████▍   | 80/125 [01:51<01:42,  2.29s/it][A
+ 51%|█████     | 64/125 [01:52<01:06,  1.10s/it][A
+ 77%|███████▋  | 96/125 [01:56<00:46,  1.61s/it][A
+ 64%|██████▍   | 80/125 [01:56<00:45,  1.01s/it][A
+ 64%|██████▍   | 80/125 [01:57<00:36,  1.22it/s][A
+ 77%|███████▋  | 96/125 [02:02<00:22,  1.26it/s][A
+ 38%|███▊      | 48/125 [03:07<05:04,  3.96s/it][A
+ 26%|██▌       | 32/125 [03:08<09:07,  5.89s/it][A
+ 51%|█████     | 64/125 [03:12<03:12,  3.16s/it][A
+ 38%|███▊      | 48/125 [03:13<04:18,  3.35s/it][A
+ 64%|██████▍   | 80/125 [03:16<02:42,  3.60s/it][A
+ 51%|█████     | 64/125 [03:18<02:11,  2.15s/it][A
+ 64%|██████▍   | 80/125 [03:21<02:47,  3.71s/it][A
+ 77%|███████▋  | 96/125 [03:25<01:13,  2.54s/it][A
+ 90%|████████▉ | 112/125 [03:27<00:38,  2.94s/it][A
+ 77%|███████▋  | 96/125 [03:29<01:12,  2.50s/it][A
+100%|██████████| 125/125 [03:31<00:00,  2.22s/it][A100%|██████████| 125/125 [03:31<00:00,  1.69s/it]
+
+ 90%|████████▉ | 112/125 [03:34<00:31,  2.41s/it][A
+ 90%|████████▉ | 112/125 [03:34<00:23,  1.78s/it][A
+100%|██████████| 125/125 [03:38<00:00,  1.38s/it][A100%|██████████| 125/125 [03:38<00:00,  1.75s/it]
+
+ 51%|█████     | 64/125 [04:39<04:45,  4.68s/it][A
+ 64%|██████▍   | 80/125 [04:43<03:03,  4.07s/it][A
+ 64%|██████▍   | 80/125 [04:44<02:19,  3.10s/it][A
+ 77%|███████▋  | 96/125 [04:47<02:05,  4.32s/it][A
+ 77%|███████▋  | 96/125 [04:48<01:01,  2.14s/it][A
+ 77%|███████▋  | 96/125 [04:49<01:21,  2.81s/it][A
+ 64%|██████▍   | 80/125 [04:52<02:36,  3.48s/it][A
+100%|██████████| 125/125 [04:55<00:00,  3.46s/it][A100%|██████████| 125/125 [04:55<00:00,  2.36s/it]
+
+ 90%|████████▉ | 112/125 [04:59<00:47,  3.62s/it][A
+ 90%|████████▉ | 112/125 [06:18<01:01,  4.75s/it][A
+ 90%|████████▉ | 112/125 [06:18<00:48,  3.72s/it][A
+ 90%|████████▉ | 112/125 [06:18<00:42,  3.27s/it][A
+100%|██████████| 125/125 [06:20<00:00,  4.33s/it][A100%|██████████| 125/125 [06:20<00:00,  3.04s/it]
+
+100%|██████████| 125/125 [06:22<00:00,  2.45s/it][A100%|██████████| 125/125 [06:22<00:00,  3.06s/it]
+
+100%|██████████| 125/125 [06:23<00:00,  3.56s/it][A100%|██████████| 125/125 [06:23<00:00,  3.07s/it]
+
+ 77%|███████▋  | 96/125 [06:25<02:03,  4.28s/it][A
+100%|██████████| 125/125 [07:37<00:00,  4.37s/it][A100%|██████████| 125/125 [07:37<00:00,  3.66s/it]
+
+ 90%|████████▉ | 112/125 [08:01<01:02,  4.84s/it][A
+100%|██████████| 125/125 [08:05<00:00,  3.60s/it][A100%|██████████| 125/125 [08:05<00:00,  3.88s/it]
+ 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s] 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [08:05<00:05,  2.06it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.16it/s][A100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.09it/s][A100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.02it/s][A100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
+
+100%|��█████████| 1/1 [00:01<00:00,  1.15s/it][A100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.15s/it][A100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.41s/it][A100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.41s/it][A100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.45s/it][A100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
+100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]
+100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]
+100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]
+
+
+100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s][rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 11777, 'num_samples': 127, 'runtime': 488.62182304263115, 'samples/s': 0.2599147111546828, 'tokens/s': 24.10248467140708}
+
+100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]
+[rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 12838, 'num_samples': 126, 'runtime': 488.6218945365399, 'samples/s': 0.2578681008952977, 'tokens/s': 26.27389428010978}
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 10165, 'num_samples': 126, 'runtime': 488.63368498161435, 'samples/s': 0.25786187868881977, 'tokens/s': 20.802904737078197}
+[rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 11956, 'num_samples': 126, 'runtime': 488.63352395407856, 'samples/s': 0.25786196366633535, 'tokens/s': 24.468235219005596}
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 10715, 'num_samples': 127, 'runtime': 488.63369226641953, 'samples/s': 0.2599083976607068, 'tokens/s': 21.928491975861995}
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 15034, 'num_samples': 127, 'runtime': 488.6339204479009, 'samples/s': 0.2599082762891018, 'tokens/s': 30.767409651420127}
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 7918, 'num_samples': 126, 'runtime': 488.6335643827915, 'samples/s': 0.2578619423312735, 'tokens/s': 16.20437189983352}
+100%|██████████| 1012/1012 [08:08<00:00,  2.08it/s]100%|██████████| 1012/1012 [08:08<00:00,  2.07it/s]
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 11288, 'num_samples': 127, 'runtime': 488.642715068534, 'samples/s': 0.2599035984444949, 'tokens/s': 23.100722986153215}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 18:58:34.764311
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' sw = zh ']'
++ src_lang=sw
++ tgt_lang=zh
++ lp=sw2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/train.log
+[2025-09-15 18:59:07,939] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 18:59:14,907] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:59:15,072] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:59:15,088] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 18:59:15,147] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:59:15,232] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 18:59:15,241] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:59:15,322] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 18:59:15,331] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.39s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.05s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 18:59:19.430629
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 198998.39 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5622.14 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  6.05it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.47it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.22it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.06it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.85it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.22it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.78it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.70it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.92it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.84it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.67it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.72it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.30it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.24it/s][A
+ 26%|██▌       | 32/125 [00:07<00:23,  3.94it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.07it/s][A
+ 38%|███▊      | 48/125 [00:08<00:12,  6.41it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.68it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.13it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  4.91it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.01it/s][A
+ 38%|███▊      | 48/125 [00:10<00:15,  4.93it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.74it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.14it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.36it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.07it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.03it/s][A
+ 51%|█████     | 64/125 [00:12<00:10,  5.61it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  4.85it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.52it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.96it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.82it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.74it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:07,  5.81it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.11it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.11it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.04it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.59it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.40it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:09,  4.60it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.79it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.42it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.34it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.96it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.61it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.98it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.97it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.53it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.30it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.49it/s][A
+100%|██████████| 125/125 [00:22<00:00,  5.54it/s][A100%|██████████| 125/125 [00:22<00:00,  5.44it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.36it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.12it/s][A
+100%|██████████| 125/125 [00:23<00:00,  4.83it/s][A100%|██████████| 125/125 [00:23<00:00,  5.33it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.92it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.84it/s][A100%|██████████| 125/125 [00:23<00:00,  5.27it/s]
+
+100%|██████████| 125/125 [00:24<00:00,  5.13it/s][A100%|██████████| 125/125 [00:24<00:00,  5.16it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.41it/s][A100%|██████████| 125/125 [00:26<00:00,  4.80it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.95it/s][A100%|██████████| 125/125 [00:26<00:00,  4.76it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.73it/s][A100%|██████████| 125/125 [00:26<00:00,  4.72it/s]
+
+ 64%|██████▍   | 80/125 [01:50<01:45,  2.34s/it][A
+ 77%|███████▋  | 96/125 [01:53<00:46,  1.61s/it][A
+ 90%|████████▉ | 112/125 [01:57<00:14,  1.15s/it][A
+100%|██████████| 125/125 [02:01<00:00,  1.08it/s][A100%|██████████| 125/125 [02:01<00:00,  1.03it/s]
+ 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s] 99%|█████████▉| 1000/1012 [02:01<00:01,  8.23it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:01<00:01,  8.21it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.14it/s][A100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.63it/s][A100%|██████████| 1/1 [00:00<00:00,  1.63it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.37it/s][A100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.13it/s][A100%|██████████| 1/1 [00:00<00:00,  1.13it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.50it/s][A100%|██████████| 2/2 [00:01<00:00,  1.50it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.43it/s][A100%|██████████| 2/2 [00:01<00:00,  1.43it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.19it/s][A100%|██████████| 2/2 [00:01<00:00,  1.19it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.25it/s][A100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+
+100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+[rank1] {'num_prompt_tokens': 14527, 'num_generated_tokens': 4622, 'num_samples': 127, 'runtime': 123.38878940418363, 'samples/s': 1.0292669262195868, 'tokens/s': 37.458832543204174}
+
+100%|██████████| 1012/1012 [02:03<00:00,  8.19it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+[rank3] {'num_prompt_tokens': 14180, 'num_generated_tokens': 3530, 'num_samples': 127, 'runtime': 123.39603731781244, 'samples/s': 1.0292064701632628, 'tokens/s': 28.607077477766282}
+[rank2] {'num_prompt_tokens': 15360, 'num_generated_tokens': 3443, 'num_samples': 127, 'runtime': 123.39564071781933, 'samples/s': 1.0292097780862706, 'tokens/s': 27.902120204338814}
+[rank6] {'num_prompt_tokens': 13761, 'num_generated_tokens': 3397, 'num_samples': 126, 'runtime': 123.39609559625387, 'samples/s': 1.021101999955217, 'tokens/s': 27.52923407815772}
+[rank4] {'num_prompt_tokens': 14537, 'num_generated_tokens': 3185, 'num_samples': 126, 'runtime': 123.39463981427252, 'samples/s': 1.0211140466850823, 'tokens/s': 25.811493957872916}
+[rank5] {'num_prompt_tokens': 13718, 'num_generated_tokens': 3446, 'num_samples': 126, 'runtime': 123.3962887134403, 'samples/s': 1.0211004019140821, 'tokens/s': 27.92628559520577}
+100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+100%|██████████| 1012/1012 [02:03<00:00,  8.20it/s]
+[rank7] {'num_prompt_tokens': 12607, 'num_generated_tokens': 3102, 'num_samples': 126, 'runtime': 123.39641184732318, 'samples/s': 1.021099382986097, 'tokens/s': 25.138494333514863}
+[rank0] {'num_prompt_tokens': 15593, 'num_generated_tokens': 3672, 'num_samples': 127, 'runtime': 123.40467374026775, 'samples/s': 1.029134441595781, 'tokens/s': 29.755761177478014}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:01:26.382119
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=sw
++ lp=zh2sw
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/train.log
+[2025-09-15 19:01:59,234] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 19:02:06,189] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:02:06,282] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:02:06,330] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:02:06,494] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:02:06,626] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:02:06,666] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:02:06,720] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:02:06,727] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.39s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 19:02:10.630264
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 133365.87 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5725.80 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]
+[A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [01:32<10:28,  5.77s/it][A
+ 13%|█▎        | 16/125 [01:32<10:32,  5.80s/it][A
+ 13%|█▎        | 16/125 [01:33<10:38,  5.86s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.87s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.87s/it][A
+ 13%|█▎        | 16/125 [01:34<10:42,  5.89s/it][A
+ 13%|█▎        | 16/125 [01:34<10:43,  5.90s/it][A
+ 13%|█▎        | 16/125 [01:36<10:59,  6.05s/it][A
+ 26%|██▌       | 32/125 [03:02<08:48,  5.69s/it][A
+ 26%|██▌       | 32/125 [03:04<08:55,  5.75s/it][A
+ 26%|██▌       | 32/125 [03:04<08:55,  5.76s/it][A
+ 26%|██▌       | 32/125 [03:06<09:02,  5.83s/it][A
+ 26%|██▌       | 32/125 [03:07<09:04,  5.85s/it][A
+ 26%|██▌       | 32/125 [03:08<09:07,  5.89s/it][A
+ 26%|██▌       | 32/125 [03:09<09:11,  5.93s/it][A
+ 26%|██▌       | 32/125 [03:12<09:17,  5.99s/it][A
+ 38%|███▊      | 48/125 [04:32<07:15,  5.66s/it][A
+ 38%|███▊      | 48/125 [04:35<07:20,  5.72s/it][A
+ 38%|███▊      | 48/125 [04:35<07:20,  5.72s/it][A
+ 38%|███▊      | 48/125 [04:40<07:29,  5.84s/it][A
+ 38%|███▊      | 48/125 [04:41<07:30,  5.85s/it][A
+ 38%|███▊      | 48/125 [04:41<07:32,  5.87s/it][A
+ 38%|███▊      | 48/125 [04:42<07:31,  5.87s/it][A
+ 38%|███▊      | 48/125 [04:44<07:34,  5.91s/it][A
+ 51%|█████     | 64/125 [06:03<05:46,  5.67s/it][A
+ 51%|█████     | 64/125 [06:06<05:48,  5.71s/it][A
+ 51%|█████     | 64/125 [06:07<05:50,  5.74s/it][A
+ 51%|█████     | 64/125 [06:13<05:55,  5.83s/it][A
+ 51%|█████     | 64/125 [06:13<05:54,  5.82s/it][A
+ 51%|█████     | 64/125 [06:13<05:54,  5.81s/it][A
+ 51%|█████     | 64/125 [06:14<05:56,  5.85s/it][A
+ 51%|█████     | 64/125 [06:20<06:01,  5.93s/it][A
+ 64%|██████▍   | 80/125 [07:34<04:15,  5.68s/it][A
+ 64%|██████▍   | 80/125 [07:37<04:16,  5.70s/it][A
+ 64%|██████▍   | 80/125 [07:39<04:18,  5.74s/it][A
+ 64%|██████▍   | 80/125 [07:46<04:20,  5.80s/it][A
+ 64%|██████▍   | 80/125 [07:46<04:21,  5.82s/it][A
+ 64%|██████▍   | 80/125 [07:47<04:22,  5.83s/it][A
+ 64%|██████▍   | 80/125 [07:49<04:24,  5.88s/it][A
+ 64%|██████▍   | 80/125 [07:55<04:26,  5.93s/it][A
+ 77%|███████▋  | 96/125 [09:06<02:45,  5.70s/it][A
+ 77%|███████▋  | 96/125 [09:08<02:45,  5.71s/it][A
+ 77%|███████▋  | 96/125 [09:09<02:45,  5.70s/it][A
+ 77%|███████▋  | 96/125 [09:18<02:48,  5.80s/it][A
+ 77%|███████▋  | 96/125 [09:18<02:48,  5.80s/it][A
+ 77%|███████▋  | 96/125 [09:20<02:49,  5.84s/it][A
+ 77%|███████▋  | 96/125 [09:21<02:49,  5.84s/it][A
+ 77%|███████▋  | 96/125 [09:28<02:51,  5.90s/it][A
+ 90%|████████▉ | 112/125 [10:37<01:14,  5.69s/it][A
+ 90%|████████▉ | 112/125 [10:38<01:13,  5.67s/it][A
+ 90%|████████▉ | 112/125 [10:39<01:13,  5.67s/it][A
+ 90%|████████▉ | 112/125 [10:50<01:15,  5.78s/it][A
+ 90%|████████▉ | 112/125 [10:53<01:15,  5.85s/it][A
+ 90%|████████▉ | 112/125 [10:54<01:15,  5.83s/it][A
+ 90%|████████▉ | 112/125 [10:56<01:16,  5.87s/it][A
+ 90%|████████▉ | 112/125 [11:03<01:16,  5.92s/it][A
+100%|██████████| 125/125 [11:56<00:00,  5.81s/it][A100%|██████████| 125/125 [11:56<00:00,  5.73s/it]
+
+100%|██████████| 125/125 [11:56<00:00,  5.78s/it][A100%|██████████| 125/125 [11:56<00:00,  5.74s/it]
+
+100%|██████████| 125/125 [11:57<00:00,  5.77s/it][A100%|██████████| 125/125 [11:57<00:00,  5.74s/it]
+
+100%|██████████| 125/125 [12:10<00:00,  5.88s/it][A100%|██████████| 125/125 [12:10<00:00,  5.85s/it]
+
+100%|██████████| 125/125 [12:13<00:00,  5.93s/it][A100%|██████████| 125/125 [12:13<00:00,  5.87s/it]
+
+100%|██████████| 125/125 [12:15<00:00,  5.96s/it][A100%|██████████| 125/125 [12:15<00:00,  5.89s/it]
+
+100%|██████████| 125/125 [12:18<00:00,  5.99s/it][A100%|██████████| 125/125 [12:18<00:00,  5.91s/it]
+
+100%|██████████| 125/125 [12:26<00:00,  6.04s/it][A100%|██████████| 125/125 [12:26<00:00,  5.97s/it]
+ 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s] 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [12:26<00:08,  1.34it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.01s/it][A100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.24s/it][A100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.67s/it][A100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.75s/it][A100%|██████████| 2/2 [00:03<00:00,  1.75s/it]
+
+100%|██████████| 1/1 [00:25<00:00, 25.93s/it][A100%|██████████| 1/1 [00:25<00:00, 25.93s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.13s/it][A100%|██████████| 2/2 [00:44<00:00, 22.13s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.48s/it][A100%|██████████| 2/2 [00:44<00:00, 22.48s/it]
+
+100%|██████████| 2/2 [00:46<00:00, 23.39s/it][A100%|██████████| 2/2 [00:46<00:00, 23.39s/it]
+100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s]100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s]100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s]100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s][rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 35240, 'num_samples': 127, 'runtime': 793.4123119395226, 'samples/s': 0.16006809837566585, 'tokens/s': 44.41574635242886}
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 36084, 'num_samples': 127, 'runtime': 793.409146066755, 'samples/s': 0.16006873708173083, 'tokens/s': 45.479687471316346}
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 37894, 'num_samples': 127, 'runtime': 793.412037268281, 'samples/s': 0.1600681537896264, 'tokens/s': 47.760808029166164}
+100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s]100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s][rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 32442, 'num_samples': 126, 'runtime': 793.4127155467868, 'samples/s': 0.15880763886316857, 'tokens/s': 40.88918587300726}
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s][rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 30734, 'num_samples': 126, 'runtime': 793.4128247015178, 'samples/s': 0.15880761701501517, 'tokens/s': 38.736454772535524}
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 25520, 'num_samples': 126, 'runtime': 793.4130516331643, 'samples/s': 0.15880757159293152, 'tokens/s': 32.16483513533026}
+100%|██████████| 1012/1012 [13:13<00:00,  1.25it/s]100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 36288, 'num_samples': 127, 'runtime': 793.4214985501021, 'samples/s': 0.16006624503127242, 'tokens/s': 45.73609369838436}
+100%|██████████| 1012/1012 [13:13<00:00,  1.28it/s]
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 35403, 'num_samples': 126, 'runtime': 793.4133041799068, 'samples/s': 0.1588075210438234, 'tokens/s': 44.621132281860945}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:15:27.965943
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' si = zh ']'
++ src_lang=si
++ tgt_lang=zh
++ lp=si2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/train.log
+[2025-09-15 19:16:00,974] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 19:16:07,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:16:08,006] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:16:08,056] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:16:08,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:16:08,360] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:16:08,455] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:16:08,458] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:16:08,466] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.08s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.38s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 19:16:12.482100
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 138023.47 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6423.22 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]
+[A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.40it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.24it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.20it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.05it/s][A
+ 13%|█▎        | 16/125 [00:03<00:27,  4.03it/s][A
+ 13%|█▎        | 16/125 [00:05<00:37,  2.88it/s][A
+ 13%|█▎        | 16/125 [00:05<00:38,  2.85it/s][A
+ 13%|█▎        | 16/125 [00:06<00:46,  2.32it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.73it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.50it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.19it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  3.94it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.42it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  4.95it/s][A
+ 26%|██▌       | 32/125 [00:10<00:28,  3.21it/s][A
+ 26%|██▌       | 32/125 [00:10<00:30,  3.02it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.51it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.91it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.98it/s][A
+ 26%|██▌       | 32/125 [00:12<00:37,  2.51it/s][A
+ 38%|███▊      | 48/125 [00:13<00:23,  3.32it/s][A
+ 38%|███▊      | 48/125 [00:14<00:22,  3.36it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.33it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.23it/s][A
+ 38%|███▊      | 48/125 [00:15<00:24,  3.15it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.26it/s][A
+ 38%|███▊      | 48/125 [00:16<00:23,  3.27it/s][A
+ 51%|█████     | 64/125 [00:17<00:15,  3.87it/s][A
+ 51%|█████     | 64/125 [00:18<00:19,  3.18it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.29it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  3.89it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  4.06it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.33it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  3.80it/s][A
+ 51%|█████     | 64/125 [00:21<00:19,  3.10it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.82it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.50it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.75it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.76it/s][A
+ 64%|██████▍   | 80/125 [00:26<00:15,  2.84it/s][A
+ 64%|██████▍   | 80/125 [00:26<00:16,  2.65it/s][A
+ 64%|██████▍   | 80/125 [00:26<00:14,  3.10it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  3.96it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.89it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.75it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:08,  3.39it/s][A
+ 77%|███████▋  | 96/125 [00:29<00:08,  3.33it/s][A
+ 77%|███████▋  | 96/125 [00:30<00:08,  3.49it/s][A
+ 77%|███████▋  | 96/125 [00:30<00:09,  3.04it/s][A
+100%|██████████| 125/125 [00:30<00:00,  4.02it/s][A100%|██████████| 125/125 [00:30<00:00,  4.11it/s]
+
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.84it/s][A
+100%|██████████| 125/125 [00:32<00:00,  3.58it/s][A100%|██████████| 125/125 [00:32<00:00,  3.85it/s]
+
+100%|██████████| 125/125 [00:32<00:00,  3.64it/s][A100%|██████████| 125/125 [00:32<00:00,  3.83it/s]
+
+100%|██████████| 125/125 [00:34<00:00,  4.22it/s][A100%|██████████| 125/125 [00:34<00:00,  3.65it/s]
+
+ 90%|████████▉ | 112/125 [00:35<00:04,  3.08it/s][A
+ 90%|████████▉ | 112/125 [00:36<00:04,  2.89it/s][A
+ 90%|████████▉ | 112/125 [00:37<00:04,  3.01it/s][A
+100%|██████████| 125/125 [00:40<00:00,  2.98it/s][A100%|██████████| 125/125 [00:40<00:00,  3.11it/s]
+
+100%|██████████| 125/125 [00:40<00:00,  2.96it/s][A100%|██████████| 125/125 [00:40<00:00,  3.10it/s]
+
+100%|██████████| 125/125 [00:40<00:00,  3.14it/s][A100%|██████████| 125/125 [00:40<00:00,  3.07it/s]
+
+ 64%|██████▍   | 80/125 [02:08<01:59,  2.66s/it][A
+ 77%|███████▋  | 96/125 [02:14<00:55,  1.90s/it][A
+ 90%|████████▉ | 112/125 [02:18<00:17,  1.35s/it][A
+100%|██████████| 125/125 [02:23<00:00,  1.08s/it][A100%|██████████| 125/125 [02:23<00:00,  1.15s/it]
+ 99%|█████████▉| 1000/1012 [02:23<00:01,  6.98it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.98it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.98it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.98it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.96it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.96it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.96it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.49it/s][A100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.14it/s][A100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.03it/s][A100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.02s/it][A100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
+
+100%|██████████| 2/2 [00:01<00:00,  1.66it/s][A100%|██████████| 2/2 [00:01<00:00,  1.66it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.30it/s][A100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.63it/s][A100%|██████████| 2/2 [00:01<00:00,  1.63it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.37it/s][A100%|██████████| 2/2 [00:01<00:00,  1.37it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.98it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.98it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+[rank7] {'num_prompt_tokens': 38819, 'num_generated_tokens': 3280, 'num_samples': 126, 'runtime': 145.17015904188156, 'samples/s': 0.8679469722399975, 'tokens/s': 22.59417515037454}[rank1] {'num_prompt_tokens': 47254, 'num_generated_tokens': 3712, 'num_samples': 127, 'runtime': 145.1670600809157, 'samples/s': 0.8748541158663031, 'tokens/s': 25.570539197604074}
+
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s][rank2] {'num_prompt_tokens': 44528, 'num_generated_tokens': 4675, 'num_samples': 127, 'runtime': 145.170135486871, 'samples/s': 0.8748355822226653, 'tokens/s': 32.203593282605986}[rank3] {'num_prompt_tokens': 43051, 'num_generated_tokens': 3678, 'num_samples': 127, 'runtime': 145.16983710974455, 'samples/s': 0.8748373803298503, 'tokens/s': 25.335841613017237}
+
+[rank4] {'num_prompt_tokens': 44578, 'num_generated_tokens': 3194, 'num_samples': 126, 'runtime': 145.1690885834396, 'samples/s': 0.8679533723708566, 'tokens/s': 22.001929137718378}
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s][rank5] {'num_prompt_tokens': 39718, 'num_generated_tokens': 3270, 'num_samples': 126, 'runtime': 145.16934940218925, 'samples/s': 0.8679518129610068, 'tokens/s': 22.525416098273745}
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.97it/s]
+[rank0] {'num_prompt_tokens': 47610, 'num_generated_tokens': 3993, 'num_samples': 127, 'runtime': 145.17939856275916, 'samples/s': 0.8747797639146407, 'tokens/s': 27.503902341032763}
+[rank6] {'num_prompt_tokens': 39153, 'num_generated_tokens': 3531, 'num_samples': 126, 'runtime': 145.17095468379557, 'samples/s': 0.8679422152623242, 'tokens/s': 24.323047318184656}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:18:41.306944
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=si
++ lp=zh2si
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/train.log
+[2025-09-15 19:19:14,194] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 19:19:21,302] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:19:21,305] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:19:21,370] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:19:21,481] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:19:21,546] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:19:21,633] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:19:21,685] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:19:21,686] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.83s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.03s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.30s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.47s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.48s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 19:19:25.837836
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 177644.41 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6024.98 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]
+[A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [01:31<10:25,  5.74s/it][A
+ 13%|█▎        | 16/125 [01:32<10:28,  5.76s/it][A
+ 13%|█▎        | 16/125 [01:32<10:33,  5.81s/it][A
+ 13%|█▎        | 16/125 [01:33<10:34,  5.82s/it][A
+ 13%|█▎        | 16/125 [01:33<10:35,  5.83s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:37<11:02,  6.07s/it][A
+ 26%|██▌       | 32/125 [03:02<08:49,  5.70s/it][A
+ 26%|██▌       | 32/125 [03:03<08:52,  5.73s/it][A
+ 26%|██▌       | 32/125 [03:04<08:55,  5.75s/it][A
+ 26%|██▌       | 32/125 [03:04<08:54,  5.74s/it][A
+ 26%|██▌       | 32/125 [03:05<08:57,  5.78s/it][A
+ 26%|██▌       | 32/125 [03:05<08:59,  5.80s/it][A
+ 26%|██▌       | 32/125 [03:07<09:06,  5.87s/it][A
+ 26%|██▌       | 32/125 [03:12<09:17,  6.00s/it][A
+ 38%|███▊      | 48/125 [04:32<07:15,  5.66s/it][A
+ 38%|███▊      | 48/125 [04:34<07:19,  5.70s/it][A
+ 38%|███▊      | 48/125 [04:34<07:19,  5.70s/it][A
+ 38%|███▊      | 48/125 [04:36<07:21,  5.73s/it][A
+ 38%|███▊      | 48/125 [04:37<07:26,  5.80s/it][A
+ 38%|███▊      | 48/125 [04:38<07:26,  5.80s/it][A
+ 38%|███▊      | 48/125 [04:39<07:26,  5.80s/it][A
+ 38%|███▊      | 48/125 [04:44<07:33,  5.88s/it][A
+ 51%|█████     | 64/125 [06:04<05:46,  5.68s/it][A
+ 51%|█████     | 64/125 [06:04<05:46,  5.67s/it][A
+ 51%|█████     | 64/125 [06:07<05:49,  5.73s/it][A
+ 51%|█████     | 64/125 [06:08<05:51,  5.76s/it][A
+ 51%|█████     | 64/125 [06:09<05:52,  5.77s/it][A
+ 51%|█████     | 64/125 [06:10<05:51,  5.76s/it][A
+ 51%|█████     | 64/125 [06:11<05:53,  5.80s/it][A
+ 51%|█████     | 64/125 [06:19<05:59,  5.90s/it][A
+ 64%|██████▍   | 80/125 [07:35<04:15,  5.67s/it][A
+ 64%|██████▍   | 80/125 [07:35<04:16,  5.69s/it][A
+ 64%|██████▍   | 80/125 [07:38<04:17,  5.72s/it][A
+ 64%|██████▍   | 80/125 [07:39<04:18,  5.74s/it][A
+ 64%|██████▍   | 80/125 [07:42<04:18,  5.76s/it][A
+ 64%|██████▍   | 80/125 [07:43<04:21,  5.81s/it][A
+ 64%|██████▍   | 80/125 [07:43<04:20,  5.79s/it][A
+ 64%|██████▍   | 80/125 [07:53<04:25,  5.90s/it][A
+ 77%|███████▋  | 96/125 [09:06<02:44,  5.69s/it][A
+ 77%|███████▋  | 96/125 [09:08<02:45,  5.72s/it][A
+ 77%|███████▋  | 96/125 [09:09<02:45,  5.70s/it][A
+ 77%|███████▋  | 96/125 [09:10<02:46,  5.73s/it][A
+ 77%|███████▋  | 96/125 [09:14<02:47,  5.77s/it][A
+ 77%|███████▋  | 96/125 [09:14<02:47,  5.77s/it][A
+ 77%|███████▋  | 96/125 [09:15<02:47,  5.77s/it][A
+ 77%|███████▋  | 96/125 [09:25<02:49,  5.86s/it][A
+ 90%|████████▉ | 112/125 [10:37<01:13,  5.69s/it][A
+ 90%|████████▉ | 112/125 [10:39<01:14,  5.71s/it][A
+ 90%|████████▉ | 112/125 [10:39<01:13,  5.68s/it][A
+ 90%|████████▉ | 112/125 [10:41<01:14,  5.71s/it][A
+ 90%|████████▉ | 112/125 [10:45<01:14,  5.74s/it][A
+ 90%|████████▉ | 112/125 [10:47<01:15,  5.79s/it][A
+ 90%|████████▉ | 112/125 [10:49<01:15,  5.81s/it][A
+ 90%|████████▉ | 112/125 [11:00<01:16,  5.87s/it][A
+100%|██████████| 125/125 [11:55<00:00,  5.78s/it][A100%|██████████| 125/125 [11:55<00:00,  5.73s/it]
+
+100%|██████████| 125/125 [11:57<00:00,  5.77s/it][A100%|██████████| 125/125 [11:57<00:00,  5.74s/it]
+
+100%|██████████| 125/125 [11:58<00:00,  5.83s/it][A100%|██████████| 125/125 [11:58<00:00,  5.75s/it]
+
+100%|██████████| 125/125 [12:00<00:00,  5.81s/it][A100%|██████████| 125/125 [12:00<00:00,  5.76s/it]
+
+100%|██████████| 125/125 [12:05<00:00,  5.84s/it][A100%|██████████| 125/125 [12:05<00:00,  5.80s/it]
+
+100%|██████████| 125/125 [12:08<00:00,  5.91s/it][A100%|██████████| 125/125 [12:08<00:00,  5.83s/it]
+
+100%|██████████| 125/125 [12:09<00:00,  5.90s/it][A100%|██████████| 125/125 [12:09<00:00,  5.84s/it]
+
+100%|██████████| 125/125 [12:21<00:00,  5.98s/it][A100%|██████████| 125/125 [12:21<00:00,  5.93s/it]
+ 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [12:21<00:08,  1.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:04<00:00,  4.77s/it][A100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
+
+100%|██████████| 1/1 [00:06<00:00,  6.21s/it][A100%|██████████| 1/1 [00:06<00:00,  6.21s/it]
+
+100%|██████████| 2/2 [00:09<00:00,  4.52s/it][A100%|██████████| 2/2 [00:09<00:00,  4.52s/it]
+
+100%|██████████| 2/2 [00:09<00:00,  4.58s/it][A100%|██████████| 2/2 [00:09<00:00,  4.58s/it]
+
+100%|██████████| 1/1 [00:25<00:00, 25.84s/it][A100%|██████████| 1/1 [00:25<00:00, 25.84s/it]
+
+100%|██████████| 1/1 [00:26<00:00, 26.36s/it][A100%|██████████| 1/1 [00:26<00:00, 26.36s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.13s/it][A100%|██████████| 2/2 [00:44<00:00, 22.13s/it]
+
+100%|██████████| 2/2 [00:45<00:00, 22.90s/it][A100%|██████████| 2/2 [00:45<00:00, 22.90s/it]
+100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 58186, 'num_samples': 127, 'runtime': 787.5811848528683, 'samples/s': 0.1612532173730451, 'tokens/s': 73.87936776431498}
+100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+[rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 63714, 'num_samples': 126, 'runtime': 787.5816594436765, 'samples/s': 0.15998341059516613, 'tokens/s': 80.89827795762234}
+100%|██████████| 1012/1012 [13:07<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+
+[rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 60392, 'num_samples': 126, 'runtime': 787.5815443471074, 'samples/s': 0.15998343397502032, 'tokens/s': 76.68031384618594}
+100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s]
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 65422, 'num_samples': 126, 'runtime': 787.5817047469318, 'samples/s': 0.15998340139260436, 'tokens/s': 83.0669371897378}
+100%|██████████| 1012/1012 [13:07<00:00,  1.28it/s][rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 50121, 'num_samples': 126, 'runtime': 787.5818729139864, 'samples/s': 0.1599833672324258, 'tokens/s': 63.63909800838424}[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 59784, 'num_samples': 127, 'runtime': 787.5784716643393, 'samples/s': 0.16125377288642617, 'tokens/s': 75.9086264428512}
+
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 64322, 'num_samples': 127, 'runtime': 787.5815520677716, 'samples/s': 0.16125314218770784, 'tokens/s': 81.6702725338405}
+
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 62942, 'num_samples': 127, 'runtime': 787.5907379984856, 'samples/s': 0.16125126143909047, 'tokens/s': 79.91714092519081}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:32:37.206072
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/hypo.zh2si.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/hypo.zh2si.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' km = zh ']'
++ src_lang=km
++ tgt_lang=zh
++ lp=km2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/train.log
+[2025-09-15 19:33:11,897] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 19:33:18,689] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:33:18,967] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:33:19,077] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:33:19,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:33:19,261] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:33:19,358] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:33:19,379] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:33:19,403] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 19:33:23.411004
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 134485.64 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 6408.12 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.94it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.49it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.40it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.90it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.33it/s][A
+ 13%|█▎        | 16/125 [00:05<00:37,  2.94it/s][A
+ 13%|█▎        | 16/125 [00:05<00:38,  2.80it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.32it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.68it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.28it/s][A
+ 13%|█▎        | 16/125 [00:07<00:51,  2.11it/s][A
+ 26%|██▌       | 32/125 [00:09<00:25,  3.63it/s][A
+ 26%|██▌       | 32/125 [00:10<00:29,  3.13it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.70it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.66it/s][A
+ 26%|██▌       | 32/125 [00:11<00:32,  2.86it/s][A
+ 26%|██▌       | 32/125 [00:12<00:33,  2.80it/s][A
+ 38%|███▊      | 48/125 [00:12<00:21,  3.57it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.92it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.73it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.28it/s][A
+ 38%|███▊      | 48/125 [00:14<00:22,  3.40it/s][A
+ 38%|███▊      | 48/125 [00:14<00:23,  3.24it/s][A
+ 38%|███▊      | 48/125 [00:15<00:22,  3.38it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  3.89it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.64it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:11,  4.06it/s][A
+ 51%|█████     | 64/125 [00:18<00:15,  3.93it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.11it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.21it/s][A
+ 51%|█████     | 64/125 [00:21<00:20,  3.03it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:13,  3.36it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:12,  3.49it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:07,  3.92it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.64it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.51it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.69it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:06,  4.21it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.79it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:14,  3.16it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  3.85it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.74it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:07,  3.76it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:08,  3.48it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  4.23it/s][A
+ 77%|███████▋  | 96/125 [00:29<00:08,  3.28it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.75it/s][A
+100%|██████████| 125/125 [00:29<00:00,  4.05it/s][A100%|██████████| 125/125 [00:29<00:00,  4.18it/s]
+
+100%|██████████| 125/125 [00:31<00:00,  4.39it/s][A100%|██████████| 125/125 [00:31<00:00,  3.99it/s]
+
+100%|██████████| 125/125 [00:31<00:00,  3.75it/s][A100%|██████████| 125/125 [00:31<00:00,  3.97it/s]
+
+ 90%|████████▉ | 112/125 [00:32<00:03,  3.79it/s][A
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.46it/s][A
+100%|██████████| 125/125 [00:34<00:00,  3.47it/s][A100%|██████████| 125/125 [00:34<00:00,  3.64it/s]
+
+ 90%|████████▉ | 112/125 [00:34<00:04,  3.11it/s][A
+100%|██████████| 125/125 [00:35<00:00,  3.78it/s][A100%|██████████| 125/125 [00:35<00:00,  3.52it/s]
+
+100%|██████████| 125/125 [00:37<00:00,  3.38it/s][A100%|██████████| 125/125 [00:37<00:00,  3.30it/s]
+
+100%|██████████| 125/125 [00:38<00:00,  3.21it/s][A100%|██████████| 125/125 [00:38<00:00,  3.24it/s]
+
+ 26%|██▌       | 32/125 [01:59<06:45,  4.36s/it][A
+ 38%|███▊      | 48/125 [02:03<03:11,  2.49s/it][A
+ 51%|█████     | 64/125 [02:07<01:36,  1.59s/it][A
+ 64%|██████▍   | 80/125 [02:11<00:49,  1.11s/it][A
+ 77%|███████▋  | 96/125 [02:16<00:24,  1.19it/s][A
+ 90%|████████▉ | 112/125 [02:20<00:08,  1.56it/s][A
+100%|██████████| 125/125 [02:23<00:00,  1.88it/s][A100%|██████████| 125/125 [02:23<00:00,  1.15s/it]
+ 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s] 99%|█████████▉| 1000/1012 [02:23<00:01,  6.97it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.96it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.96it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:23<00:01,  6.95it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.44it/s][A100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.30it/s][A100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.05it/s][A100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.14s/it][A100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
+
+100%|██████████| 2/2 [00:01<00:00,  1.77it/s][A100%|██████████| 2/2 [00:01<00:00,  1.77it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.24it/s][A100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.27it/s][A100%|██████████| 2/2 [00:01<00:00,  1.27it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.14it/s][A100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+[rank2] {'num_prompt_tokens': 45616, 'num_generated_tokens': 4491, 'num_samples': 127, 'runtime': 145.53471057116985, 'samples/s': 0.8726440551643798, 'tokens/s': 30.858617730261653}
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+
+[rank1] {'num_prompt_tokens': 41834, 'num_generated_tokens': 3618, 'num_samples': 127, 'runtime': 145.54704684764147, 'samples/s': 0.8725700916003023, 'tokens/s': 24.85794166464483}
+[rank5] {'num_prompt_tokens': 37873, 'num_generated_tokens': 3303, 'num_samples': 126, 'runtime': 145.5419071111828, 'samples/s': 0.8657300326822411, 'tokens/s': 22.694494428170177}
+[rank4] {'num_prompt_tokens': 41429, 'num_generated_tokens': 3159, 'num_samples': 126, 'runtime': 145.54665123671293, 'samples/s': 0.8657018140189099, 'tokens/s': 21.70438119433124}
+[rank7] {'num_prompt_tokens': 37707, 'num_generated_tokens': 3240, 'num_samples': 126, 'runtime': 145.53487974964082, 'samples/s': 0.8657718357053232, 'tokens/s': 22.26270434670831}
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s][rank3] {'num_prompt_tokens': 41276, 'num_generated_tokens': 3513, 'num_samples': 127, 'runtime': 145.54662474803627, 'samples/s': 0.8725726221398583, 'tokens/s': 24.136595445490727}
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+100%|██████████| 1012/1012 [02:25<00:00,  6.95it/s]
+[rank0] {'num_prompt_tokens': 44859, 'num_generated_tokens': 3762, 'num_samples': 127, 'runtime': 145.56143709272146, 'samples/s': 0.8724838290728197, 'tokens/s': 25.844757204503527}[rank6] {'num_prompt_tokens': 35077, 'num_generated_tokens': 3460, 'num_samples': 126, 'runtime': 145.54515072330832, 'samples/s': 0.8657107390649857, 'tokens/s': 23.772691723530563}
+
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:35:52.577049
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/hypo.km2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/hypo.km2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=km
++ lp=zh2km
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/train.log
+[2025-09-15 19:36:25,407] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 19:36:32,187] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:36:32,367] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:36:32,570] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:36:32,619] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:36:32,703] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 19:36:32,782] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 19:36:32,811] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 19:36:32,838] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.60s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
+
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 19:36:36.949381
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 1012 examples [00:00, 130898.19 examples/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
+Map (num_proc=8):   0%|          | 0/1012 [00:00<?, ? examples/s]Map (num_proc=8): 100%|██████████| 1012/1012 [00:00<00:00, 5651.27 examples/s]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:15<01:45,  1.03it/s][A
+ 13%|█▎        | 16/125 [00:17<02:01,  1.11s/it][A
+ 13%|█▎        | 16/125 [01:32<10:28,  5.77s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.87s/it][A
+ 13%|█▎        | 16/125 [01:34<10:41,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:35<10:48,  5.95s/it][A
+ 13%|█▎        | 16/125 [01:35<10:52,  5.98s/it][A
+ 26%|██▌       | 32/125 [01:47<04:33,  2.94s/it][A
+ 26%|██▌       | 32/125 [01:48<05:53,  3.81s/it][A
+ 26%|██▌       | 32/125 [01:51<06:04,  3.92s/it][A
+ 38%|███▊      | 48/125 [02:06<03:18,  2.58s/it][A
+ 26%|██▌       | 32/125 [03:05<08:56,  5.77s/it][A
+ 26%|██▌       | 32/125 [03:06<09:02,  5.83s/it][A
+ 26%|██▌       | 32/125 [03:07<09:03,  5.85s/it][A
+ 26%|██▌       | 32/125 [03:09<09:09,  5.91s/it][A
+ 26%|██▌       | 32/125 [03:10<09:11,  5.94s/it][A
+ 38%|███▊      | 48/125 [03:21<05:28,  4.27s/it][A
+ 38%|███▊      | 48/125 [03:23<06:06,  4.76s/it][A
+ 51%|█████     | 64/125 [03:38<03:53,  3.83s/it][A
+ 51%|█████     | 64/125 [03:41<03:22,  3.33s/it][A
+ 38%|███▊      | 48/125 [04:36<07:21,  5.74s/it][A
+ 38%|███▊      | 48/125 [04:39<07:27,  5.81s/it][A
+ 38%|███▊      | 48/125 [04:39<07:28,  5.83s/it][A
+ 38%|███▊      | 48/125 [04:41<07:29,  5.84s/it][A
+ 38%|███▊      | 48/125 [04:42<07:31,  5.86s/it][A
+ 51%|█████     | 64/125 [04:52<04:55,  4.84s/it][A
+ 51%|█████     | 64/125 [05:03<04:11,  4.12s/it][A
+ 64%|██████▍   | 80/125 [05:10<03:23,  4.52s/it][A
+ 64%|██████▍   | 80/125 [05:14<03:09,  4.21s/it][A
+ 51%|█████     | 64/125 [06:09<05:51,  5.77s/it][A
+ 51%|█████     | 64/125 [06:12<05:55,  5.82s/it][A
+ 51%|█████     | 64/125 [06:14<05:55,  5.83s/it][A
+ 51%|█████     | 64/125 [06:16<05:57,  5.86s/it][A
+ 64%|██████▍   | 80/125 [06:26<03:54,  5.22s/it][A
+ 64%|██████▍   | 80/125 [06:36<03:09,  4.20s/it][A
+ 64%|██████▍   | 80/125 [06:36<03:32,  4.73s/it][A
+ 77%|███████▋  | 96/125 [06:43<02:23,  4.96s/it][A
+ 77%|███████▋  | 96/125 [06:46<02:17,  4.75s/it][A
+ 90%|████████▉ | 112/125 [06:59<00:47,  3.67s/it][A
+ 64%|██████▍   | 80/125 [07:41<04:19,  5.77s/it][A
+ 64%|██████▍   | 80/125 [07:45<04:21,  5.82s/it][A
+ 64%|██████▍   | 80/125 [07:48<04:22,  5.83s/it][A
+ 77%|███████▋  | 96/125 [07:58<02:36,  5.38s/it][A
+ 77%|███████▋  | 96/125 [07:58<02:00,  4.17s/it][A
+ 77%|███████▋  | 96/125 [08:08<02:17,  4.73s/it][A
+ 77%|███████▋  | 96/125 [08:09<02:27,  5.09s/it][A
+ 90%|████████▉ | 112/125 [08:16<00:51,  4.00s/it][A
+ 90%|████████▉ | 112/125 [08:18<01:05,  5.06s/it][A
+100%|██████████| 125/125 [08:19<00:00,  4.35s/it][A100%|██████████| 125/125 [08:19<00:00,  4.00s/it]
+
+ 77%|███████▋  | 96/125 [09:18<02:48,  5.81s/it][A
+ 77%|███████▋  | 96/125 [09:21<02:49,  5.84s/it][A
+ 90%|████████▉ | 112/125 [09:29<01:00,  4.65s/it][A
+100%|██████████| 125/125 [09:37<00:00,  4.60s/it][A100%|██████████| 125/125 [09:37<00:00,  4.62s/it]
+
+100%|██████████| 125/125 [09:38<00:00,  5.36s/it][A100%|██████████| 125/125 [09:38<00:00,  4.62s/it]
+
+ 90%|████████▉ | 112/125 [09:41<01:08,  5.31s/it][A
+ 90%|████████▉ | 112/125 [09:42<01:06,  5.10s/it][A
+100%|██████████| 125/125 [09:44<00:00,  3.70s/it][A100%|██████████| 125/125 [09:44<00:00,  4.68s/it]
+
+100%|██████████| 125/125 [09:56<00:00,  4.01s/it][A100%|██████████| 125/125 [09:56<00:00,  4.77s/it]
+
+ 90%|████████▉ | 112/125 [10:53<01:16,  5.86s/it][A
+ 90%|████████▉ | 112/125 [10:55<01:15,  5.85s/it][A
+100%|██████████| 125/125 [11:01<00:00,  5.55s/it][A100%|██████████| 125/125 [11:01<00:00,  5.30s/it]
+
+100%|██████████| 125/125 [12:13<00:00,  5.94s/it][A100%|██████████| 125/125 [12:13<00:00,  5.87s/it]
+
+100%|██████████| 125/125 [12:16<00:00,  5.96s/it][A100%|██████████| 125/125 [12:16<00:00,  5.90s/it]
+ 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s] 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/2 [00:00<?, ?it/s][A[A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [12:17<00:08,  1.36it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:03<00:00,  3.08s/it][A100%|██████████| 1/1 [00:03<00:00,  3.08s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.11s/it][A100%|██████████| 1/1 [00:04<00:00,  4.11s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.80s/it][A100%|██████████| 1/1 [00:04<00:00,  4.80s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.81s/it][A100%|██████████| 1/1 [00:04<00:00,  4.81s/it]
+
+100%|██████████| 2/2 [00:06<00:00,  3.15s/it][A100%|██████████| 2/2 [00:06<00:00,  3.15s/it]
+
+100%|██████████| 2/2 [00:08<00:00,  4.44s/it][A100%|██████████| 2/2 [00:08<00:00,  4.44s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.23s/it][A100%|██████████| 2/2 [00:44<00:00, 22.23s/it]
+
+100%|███��██████| 2/2 [00:44<00:00, 22.50s/it][A100%|██████████| 2/2 [00:44<00:00, 22.50s/it]
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s]100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s]100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s]100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s][rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 32680, 'num_samples': 127, 'runtime': 782.0991492401809, 'samples/s': 0.16238350357928671, 'tokens/s': 41.78498344071724}
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s]100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s]100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 32613, 'num_samples': 127, 'runtime': 782.1002112589777, 'samples/s': 0.1623832830777057, 'tokens/s': 41.69925992923792}
+100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 32366, 'num_samples': 126, 'runtime': 782.0988410618156, 'samples/s': 0.16110495679668344, 'tokens/s': 41.383516124456}
+100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s][rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 34108, 'num_samples': 126, 'runtime': 782.098516209051, 'samples/s': 0.16110502371330523, 'tokens/s': 43.61087419693187}
+
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s][rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 33151, 'num_samples': 126, 'runtime': 782.1007786355913, 'samples/s': 0.16110455767581827, 'tokens/s': 42.38712056754803}
+[rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 36631, 'num_samples': 126, 'runtime': 782.1009079031646, 'samples/s': 0.16110453104805833, 'tokens/s': 46.836667276360515}
+100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:02<00:00,  1.27it/s][rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 34880, 'num_samples': 127, 'runtime': 782.1113506313413, 'samples/s': 0.16238097030235168, 'tokens/s': 44.597230268866355}
+100%|██████████| 1012/1012 [13:02<00:00,  1.29it/s]
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 34261, 'num_samples': 127, 'runtime': 782.1019120253623, 'samples/s': 0.1623829299574473, 'tokens/s': 43.806311521827574}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 19:49:42.741818
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/hypo.zh2km.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/hypo.zh2km.txt
++ metric=bleu,comet_22
++ python /mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py --metric bleu,comet_22 --comet_22_path /mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt --xcomet_xxl_path /mnt/nvme2/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt --lang_pair en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km --src_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh --ref_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km --hypo_file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915/base/best/decode_result/zh2km/hypo.zh2km.txt --record_file result_mt.xlsx
+[2025-09-15 19:50:00,711] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 169, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 126, in main
+    comet_22_model = load_from_checkpoint(args.comet_22_path, reload_hparams=True)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/comet/models/__init__.py", line 79, in load_from_checkpoint
+    raise Exception(f"Invalid checkpoint path: {checkpoint_path}")
+Exception: Invalid checkpoint path: /mnt/nvme2/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
+++++ readlink -f sft_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/sft_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl
++ per_device_train_batch_size=12
++ gradient_accumulation_steps=1
++ max_lengths=1024
++ num_train_epochs=1
++ task=sft_0915_0.1
++ tag=base
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ cp sft_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/train.log
++ swift sft --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 1 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000
+[2025-09-15 22:03:15,512] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/sft.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 1 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:03:22,482] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:03:22,544] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:03:22,701] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:03:22,733] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:03:22,878] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:03:22,928] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:03:22,934] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:03:22,936] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-15 22:03:24,028] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[2025-09-15 22:03:24,162] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-15 22:03:24,341] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:03:24,341] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2025-09-15 22:03:24,699] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:03:24,785] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:03:24,891] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:03:24,893] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=True,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/train.jsonl'],
+dataset_num_proc=16,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=0.1,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=1,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=1024,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=-1,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=1.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base,
+overwrite_output_dir=False,
+packing=False,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=12,
+per_device_train_batch_size=12,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base,
+save_on_each_node=False,
+save_only_model=True,
+save_safetensors=True,
+save_steps=0.1,
+save_strategy=steps,
+save_total_limit=3,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=False,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.01,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] attn_impl: flash_attn
+[2025-09-15 22:03:24,922] [INFO] [comm.py:637:init_distributed] cdb=None
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.04s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.07it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.06it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.02it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.19s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.00s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.03s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.08s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.15s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.06s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.05s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.01s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.14s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.01s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.05s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.30it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] Start time of running main: 2025-09-15 22:03:27.557306
+[INFO:swift] swift.__version__: 3.7.3
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.37it/s]
+[INFO:swift] train_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 100792
+})
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 500
+})
+[INFO:swift] [INPUT_IDS] [151644, 872, 198, 27473, 279, 2701, 1467, 504, 6364, 1119, 8453, 510, 22574, 25, 1597, 773, 1657, 10488, 633, 13628, 979, 429, 8573, 624, 44923, 25, 151645, 198, 151644, 77091, 198, 39165, 106334, 99726, 13343, 3837, 49434, 239, 79478, 103939, 28726, 20726, 99555, 101135, 1773, 151645]
+[INFO:swift] [INPUT] <|im_start|>user
+Translate the following text from English into Chinese:
+English: And so many opportunities get missed when that happens.
+Chinese:<|im_end|>
+<|im_start|>assistant
+当这种情况发生时， 我们就会错失很多机会。<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 39165, 106334, 99726, 13343, 3837, 49434, 239, 79478, 103939, 28726, 20726, 99555, 101135, 1773, 151645]
+[INFO:swift] [LABELS] [-100 * 31]当这种情况发生时， 我们就会错失很多机会。<|im_end|>
+[INFO:swift] Dataset Token Length: 116.611973±73.344357, min=25.000000, max=781.000000, size=100792
+[INFO:swift] Dataset Token Length: 136.436000±75.772303, min=29.000000, max=509.000000, size=500
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 4022.4681M Params (4022.4681M Trainable [100.0000%]), 0.0001M Buffers.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/logging.jsonl
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 0/1050 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+[INFO:swift] use_logits_to_keep: True
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 1/1050 [00:01<28:21,  1.62s/it]                                                       {'loss': 2.08473206, 'token_acc': 0.63709677, 'grad_norm': 23.47024345, 'learning_rate': 1.82e-06, 'memory(GiB)': 24.18, 'train_speed(iter/s)': 0.059064, 'epoch': 0.0, 'global_step/max_steps': '1/1050', 'percentage': '0.10%', 'elapsed_time': '1s', 'remaining_time': '28m 28s'}
+Train:   0%|          | 1/1050 [00:01<28:21,  1.62s/it]Train:   0%|          | 1/1050 [00:01<28:21,  1.62s/it]Train:   0%|          | 2/1050 [00:02<17:23,  1.00it/s]Train:   0%|          | 3/1050 [00:02<14:05,  1.24it/s]Train:   0%|          | 4/1050 [00:03<11:56,  1.46it/s]Train:   0%|          | 5/1050 [00:03<11:40,  1.49it/s]Train:   1%|          | 6/1050 [00:04<11:28,  1.52it/s]Train:   1%|          | 7/1050 [00:05<11:13,  1.55it/s]Train:   1%|          | 8/1050 [00:05<10:10,  1.71it/s]Train:   1%|          | 9/1050 [00:06<10:08,  1.71it/s]Train:   1%|          | 10/1050 [00:06<09:50,  1.76it/s]                                                        {'loss': 1.69497681, 'token_acc': 0.65907118, 'grad_norm': 5.57801056, 'learning_rate': 1.818e-05, 'memory(GiB)': 41.03, 'train_speed(iter/s)': 0.453811, 'epoch': 0.01, 'global_step/max_steps': '10/1050', 'percentage': '0.95%', 'elapsed_time': '6s', 'remaining_time': '11m 40s'}
+Train:   1%|          | 10/1050 [00:06<09:50,  1.76it/s]Train:   1%|          | 10/1050 [00:06<09:50,  1.76it/s]Train:   1%|          | 11/1050 [00:07<10:19,  1.68it/s]Train:   1%|          | 12/1050 [00:07<10:00,  1.73it/s]Train:   1%|          | 13/1050 [00:08<10:04,  1.71it/s]Train:   1%|▏         | 14/1050 [00:09<09:43,  1.77it/s]Train:   1%|▏         | 15/1050 [00:09<10:10,  1.70it/s]Train:   2%|▏         | 16/1050 [00:10<09:37,  1.79it/s]Train:   2%|▏         | 17/1050 [00:10<09:04,  1.90it/s]Train:   2%|▏         | 18/1050 [00:11<09:38,  1.79it/s]Train:   2%|▏         | 19/1050 [00:11<09:16,  1.85it/s]Train:   2%|▏         | 20/1050 [00:12<09:03,  1.89it/s]                                                        {'loss': 1.49651623, 'token_acc': 0.66159464, 'grad_norm': 4.9598937, 'learning_rate': 2e-05, 'memory(GiB)': 41.03, 'train_speed(iter/s)': 0.725535, 'epoch': 0.02, 'global_step/max_steps': '20/1050', 'percentage': '1.90%', 'elapsed_time': '12s', 'remaining_time': '10m 31s'}
+Train:   2%|▏         | 20/1050 [00:12<09:03,  1.89it/s]Train:   2%|▏         | 20/1050 [00:12<09:03,  1.89it/s]Train:   2%|▏         | 21/1050 [00:12<09:56,  1.73it/s]Train:   2%|▏         | 22/1050 [00:13<09:52,  1.74it/s]W0915 22:04:00.855000 134437579875840 torch/distributed/elastic/agent/server/api.py:688] Received Signals.SIGTERM death signal, shutting down workers
+W0915 22:04:00.856000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340273 closing signal SIGTERM
+W0915 22:04:00.858000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340274 closing signal SIGTERM
+W0915 22:04:00.863000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340275 closing signal SIGTERM
+W0915 22:04:00.864000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340276 closing signal SIGTERM
+W0915 22:04:00.864000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340277 closing signal SIGTERM
+W0915 22:04:00.864000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340278 closing signal SIGTERM
+W0915 22:04:00.864000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340279 closing signal SIGTERM
+W0915 22:04:00.864000 134437579875840 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1340280 closing signal SIGTERM
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
+    result = agent.run()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
+    result = f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
+    result = self._invoke_run(role)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 835, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 79, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 1340195 got signal: 15
++ bash inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
++ comet_model=/mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=en
++ lp=zh2en
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ lang_pair_strs=en2zh,zh2en
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' de = zh ']'
++ src_lang=de
++ tgt_lang=zh
++ lp=de2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=de
++ lp=zh2de
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' ru = zh ']'
++ src_lang=ru
++ tgt_lang=zh
++ lp=ru2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=ru
++ lp=zh2ru
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' bn = zh ']'
++ src_lang=bn
++ tgt_lang=zh
++ lp=bn2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=bn
++ lp=zh2bn
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' hi = zh ']'
++ src_lang=hi
++ tgt_lang=zh
++ lp=hi2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=hi
++ lp=zh2hi
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' th = zh ']'
++ src_lang=th
++ tgt_lang=zh
++ lp=th2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=th
++ lp=zh2th
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' jv = zh ']'
++ src_lang=jv
++ tgt_lang=zh
++ lp=jv2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=jv
++ lp=zh2jv
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' sw = zh ']'
++ src_lang=sw
++ tgt_lang=zh
++ lp=sw2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=sw
++ lp=zh2sw
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' si = zh ']'
++ src_lang=si
++ tgt_lang=zh
++ lp=si2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=si
++ lp=zh2si
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' km = zh ']'
++ src_lang=km
++ tgt_lang=zh
++ lp=km2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=km
++ lp=zh2km
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ metric=bleu,comet_22
++ python /mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py --metric bleu,comet_22 --comet_22_path /mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt --xcomet_xxl_path /mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt --lang_pair en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km --src_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh --ref_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km --hypo_file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt --record_file result_mt.xlsx
+[2025-09-15 22:04:13,866] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt`
+++++ readlink -f sft_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/sft_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/train.jsonl
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/valid.jsonl
++ per_device_train_batch_size=12
++ gradient_accumulation_steps=1
++ max_lengths=1024
++ num_train_epochs=1
++ task=sft_0915_0.1
++ tag=base
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ cp sft_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base
++ swift sft --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 1 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/train.log
+Encoder model frozen.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+evaluate zh2en
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt -l zh-en
+
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 171, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 153, in main
+    score = bleu_scoring(ref_file, hypo_file, lp)
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 25, in bleu_scoring
+    return float(score.stdout.strip()) 
+ValueError: could not convert string to float: ''
+[2025-09-15 22:04:28,568] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/sft.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --load_from_cache_file --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/train.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/valid.jsonl --torch_dtype bfloat16 --num_train_epochs 1 --per_device_train_batch_size 12 --per_device_eval_batch_size 12 --learning_rate 2e-5 --gradient_accumulation_steps 1 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 0.1 --save_steps 0.1 --logging_steps 10 --max_length 1024 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base --create_checkpoint_symlink --warmup_ratio 0.01 --dataloader_num_workers 8 --dataset_num_proc 16 --seed 42 --report_to tensorboard --save_only_model --save_total_limit 3 --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:04:35,527] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:04:35,702] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:04:35,950] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:04:36,044] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:04:36,174] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:04:36,192] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:04:36,222] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:04:36,233] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-15 22:04:37,030] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:37,030] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=True,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/train.jsonl'],
+dataset_num_proc=16,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=0.1,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=1,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=1024,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=-1,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=1.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base,
+overwrite_output_dir=False,
+packing=False,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=12,
+per_device_train_batch_size=12,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base,
+save_on_each_node=False,
+save_only_model=True,
+save_safetensors=True,
+save_steps=0.1,
+save_strategy=steps,
+save_total_limit=3,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=False,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/sft_0915_0.1/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.01,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] attn_impl: flash_attn
+[2025-09-15 22:04:37,317] [INFO] [comm.py:637:init_distributed] cdb=None
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][2025-09-15 22:04:38,110] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:38,340] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:38,433] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:38,483] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:38,489] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-15 22:04:38,500] [INFO] [comm.py:637:init_distributed] cdb=None
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.17s/it]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:02<00:04,  2.02s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.11it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.10s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.07s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.09s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.37it/s]
+Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.08s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.09s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.19s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:03<00:01,  1.45s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.03s/it]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] Start time of running main: 2025-09-15 22:04:40.593985
+[INFO:swift] swift.__version__: 3.7.3
+Setting num_proc from 16 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 55258 examples [00:00, 424769.03 examples/s]Generating train split: 55258 examples [00:00, 423280.35 examples/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:01<00:00,  1.13it/s]Loading checkpoint shards: 100%|█████���████| 3/3 [00:01<00:00,  1.67it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.12s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.09s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.36it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.13s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.25s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.19it/s]
+Map (num_proc=16):   0%|          | 0/55258 [00:00<?, ? examples/s]Map (num_proc=16):  38%|███▊      | 20724/55258 [00:00<00:00, 206678.94 examples/s]Map (num_proc=16): 100%|██████████| 55258/55258 [00:00<00:00, 226542.29 examples/s]
+Setting num_proc from 16 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 500 examples [00:00, 148502.48 examples/s]
+Map (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]Map (num_proc=16): 100%|██████████| 500/500 [00:00<00:00, 3247.59 examples/s]
+[INFO:swift] train_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 55258
+})
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 500
+})
+Map (num_proc=16):   0%|          | 0/55258 [00:00<?, ? examples/s]Map (num_proc=16):   2%|▏         | 1000/55258 [00:00<00:51, 1043.74 examples/s]Map (num_proc=16):   4%|▎         | 2000/55258 [00:01<00:26, 2033.73 examples/s]Map (num_proc=16):   9%|▉         | 5000/55258 [00:01<00:09, 5395.64 examples/s]Map (num_proc=16):  13%|█▎        | 7000/55258 [00:01<00:06, 7502.83 examples/s]Map (num_proc=16):  16%|█▋        | 9000/55258 [00:01<00:04, 9661.82 examples/s]Map (num_proc=16):  22%|██▏       | 12000/55258 [00:01<00:03, 13466.46 examples/s]Map (num_proc=16):  27%|██▋       | 15000/55258 [00:01<00:02, 16593.46 examples/s]Map (num_proc=16):  33%|███▎      | 18000/55258 [00:01<00:02, 18059.91 examples/s]Map (num_proc=16):  38%|███▊      | 21000/55258 [00:02<00:01, 19837.98 examples/s]Map (num_proc=16):  48%|████▊     | 26454/55258 [00:02<00:01, 25329.68 examples/s]Map (num_proc=16):  59%|█████▊    | 32454/55258 [00:02<00:00, 33443.53 examples/s]Map (num_proc=16):  66%|██████▌   | 36362/55258 [00:02<00:00, 31003.86 examples/s]Map (num_proc=16):  73%|███████▎  | 40270/55258 [00:02<00:00, 29311.54 examples/s]Map (num_proc=16):  80%|███████▉  | 44178/55258 [00:02<00:00, 27523.64 examples/s]Map (num_proc=16):  87%|████████▋ | 48086/55258 [00:02<00:00, 26739.98 examples/s]Map (num_proc=16):  92%|█████████▏| 50993/55258 [00:03<00:00, 21932.29 examples/s]Map (num_proc=16):  97%|█████████▋| 53446/55258 [00:03<00:00, 21412.75 examples/s]Map (num_proc=16): 100%|██████████| 55258/55258 [00:03<00:00, 15647.21 examples/s]
+Map (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]Map (num_proc=16):   6%|▋         | 32/500 [00:00<00:07, 58.71 examples/s]Map (num_proc=16):  19%|█▉        | 96/500 [00:00<00:02, 145.03 examples/s]Map (num_proc=16):  32%|███▏      | 159/500 [00:00<00:01, 198.44 examples/s]Map (num_proc=16):  44%|████▍     | 221/500 [00:01<00:01, 267.25 examples/s]Map (num_proc=16):  57%|█████▋    | 283/500 [00:01<00:00, 265.50 examples/s]Map (num_proc=16):  69%|██████▉   | 345/500 [00:01<00:00, 262.56 examples/s]Map (num_proc=16):  75%|███████▌  | 376/500 [00:01<00:00, 265.44 examples/s]Map (num_proc=16):  88%|████████▊ | 438/500 [00:01<00:00, 288.48 examples/s]Map (num_proc=16):  94%|█████████▍| 469/500 [00:01<00:00, 283.21 examples/s]Map (num_proc=16): 100%|██████████| 500/500 [00:02<00:00, 285.33 examples/s]Map (num_proc=16): 100%|██████████| 500/500 [00:02<00:00, 230.09 examples/s]
+[INFO:swift] [INPUT_IDS] [151644, 872, 198, 27473, 279, 2701, 1467, 504, 6364, 1119, 8453, 510, 22574, 25, 14263, 278, 4344, 304, 7722, 35481, 7576, 1033, 2797, 11, 448, 458, 5461, 315, 220, 18, 13, 19, 4, 6513, 304, 279, 1156, 4279, 624, 44923, 25, 151645, 198, 151644, 77091, 198, 99705, 99800, 97480, 105419, 33071, 106443, 104363, 100687, 3837, 102717, 101200, 102164, 18, 13, 19, 4, 1773, 151645]
+[INFO:swift] [INPUT] <|im_start|>user
+Translate the following text from English into Chinese:
+English: Seasonal changes in fresh vegetable prices were clear, with an average of 3.4% growth in the first half.
+Chinese:<|im_end|>
+<|im_start|>assistant
+鲜菜价格季节性变动特征明显，上半年平均上涨3.4%。<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99705, 99800, 97480, 105419, 33071, 106443, 104363, 100687, 3837, 102717, 101200, 102164, 18, 13, 19, 4, 1773, 151645]
+[INFO:swift] [LABELS] [-100 * 46]鲜菜价格季节性变动特征明显，上半年平均上涨3.4%。<|im_end|>
+[INFO:swift] Dataset Token Length: 116.227279±73.266902, min=25.000000, max=781.000000, size=55258
+[INFO:swift] Dataset Token Length: 141.064000±80.870909, min=34.000000, max=507.000000, size=500
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 4022.4681M Params (4022.4681M Trainable [100.0000%]), 0.0001M Buffers.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/logging.jsonl
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 0/576 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+[INFO:swift] use_logits_to_keep: True
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 1/576 [00:01<13:32,  1.41s/it]                                                      {'loss': 1.64783251, 'token_acc': 0.64919355, 'grad_norm': 16.45900345, 'learning_rate': 3.33e-06, 'memory(GiB)': 24.66, 'train_speed(iter/s)': 0.061591, 'epoch': 0.0, 'global_step/max_steps': '1/576', 'percentage': '0.17%', 'elapsed_time': '1s', 'remaining_time': '13m 34s'}
+Train:   0%|          | 1/576 [00:01<13:32,  1.41s/it]Train:   0%|          | 1/576 [00:01<13:32,  1.41s/it]Train:   0%|          | 2/576 [00:01<08:51,  1.08it/s]Train:   1%|          | 3/576 [00:02<07:08,  1.34it/s]Train:   1%|          | 4/576 [00:03<06:39,  1.43it/s]Train:   1%|          | 5/576 [00:03<06:37,  1.44it/s]Train:   1%|          | 6/576 [00:04<06:49,  1.39it/s]Train:   1%|          | 7/576 [00:05<06:04,  1.56it/s]Train:   1%|▏         | 8/576 [00:05<06:02,  1.56it/s]Train:   2%|▏         | 9/576 [00:06<05:55,  1.59it/s]Train:   2%|▏         | 10/576 [00:06<05:49,  1.62it/s]                                                       {'loss': 1.43716155, 'token_acc': 0.69821453, 'grad_norm': 4.41848469, 'learning_rate': 2e-05, 'memory(GiB)': 31.68, 'train_speed(iter/s)': 0.459917, 'epoch': 0.02, 'global_step/max_steps': '10/576', 'percentage': '1.74%', 'elapsed_time': '6s', 'remaining_time': '6m 31s'}
+Train:   2%|▏         | 10/576 [00:06<05:49,  1.62it/s]Train:   2%|▏         | 10/576 [00:06<05:49,  1.62it/s]Train:   2%|▏         | 11/576 [00:07<05:22,  1.75it/s]Train:   2%|▏         | 12/576 [00:07<05:12,  1.80it/s]Train:   2%|▏         | 13/576 [00:08<05:07,  1.83it/s]Train:   2%|▏         | 14/576 [00:08<04:59,  1.88it/s]Train:   3%|▎         | 15/576 [00:09<04:57,  1.89it/s]Train:   3%|▎         | 16/576 [00:10<05:02,  1.85it/s]Train:   3%|▎         | 17/576 [00:10<05:10,  1.80it/s]Train:   3%|▎         | 18/576 [00:11<05:24,  1.72it/s]Train:   3%|▎         | 19/576 [00:11<05:08,  1.80it/s]Train:   3%|▎         | 20/576 [00:12<05:29,  1.69it/s]                                                       {'loss': 1.29199581, 'token_acc': 0.66970588, 'grad_norm': 4.03205919, 'learning_rate': 1.997e-05, 'memory(GiB)': 51.01, 'train_speed(iter/s)': 0.733901, 'epoch': 0.03, 'global_step/max_steps': '20/576', 'percentage': '3.47%', 'elapsed_time': '12s', 'remaining_time': '5m 45s'}
+Train:   3%|▎         | 20/576 [00:12<05:29,  1.69it/s]Train:   3%|▎         | 20/576 [00:12<05:29,  1.69it/s]Train:   4%|▎         | 21/576 [00:13<05:29,  1.68it/s]Train:   4%|▍         | 22/576 [00:13<05:39,  1.63it/s]Train:   4%|▍         | 23/576 [00:14<05:32,  1.66it/s]Train:   4%|▍         | 24/576 [00:14<05:40,  1.62it/s]Train:   4%|▍         | 25/576 [00:15<05:54,  1.56it/s]Train:   5%|▍         | 26/576 [00:16<06:00,  1.53it/s]Train:   5%|▍         | 27/576 [00:16<05:40,  1.61it/s]Train:   5%|▍         | 28/576 [00:17<05:16,  1.73it/s]Train:   5%|▌         | 29/576 [00:17<05:08,  1.77it/s]Train:   5%|▌         | 30/576 [00:18<05:11,  1.75it/s]                                                       {'loss': 1.23797531, 'token_acc': 0.70508744, 'grad_norm': 3.70418501, 'learning_rate': 1.991e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 0.90221, 'epoch': 0.05, 'global_step/max_steps': '30/576', 'percentage': '5.21%', 'elapsed_time': '18s', 'remaining_time': '5m 35s'}
+Train:   5%|▌         | 30/576 [00:18<05:11,  1.75it/s]Train:   5%|▌         | 30/576 [00:18<05:11,  1.75it/s]Train:   5%|▌         | 31/576 [00:19<05:33,  1.63it/s]Train:   6%|▌         | 32/576 [00:19<05:12,  1.74it/s]Train:   6%|▌         | 33/576 [00:20<05:32,  1.63it/s]Train:   6%|▌         | 34/576 [00:20<05:32,  1.63it/s]Train:   6%|▌         | 35/576 [00:21<05:10,  1.74it/s]Train:   6%|▋         | 36/576 [00:22<05:33,  1.62it/s]Train:   6%|▋         | 37/576 [00:22<05:37,  1.60it/s]Train:   7%|▋         | 38/576 [00:23<05:34,  1.61it/s]Train:   7%|▋         | 39/576 [00:23<05:20,  1.68it/s]Train:   7%|▋         | 40/576 [00:24<05:13,  1.71it/s]                                                       {'loss': 1.22029495, 'token_acc': 0.7065481, 'grad_norm': 3.64529181, 'learning_rate': 1.982e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.017329, 'epoch': 0.07, 'global_step/max_steps': '40/576', 'percentage': '6.94%', 'elapsed_time': '24s', 'remaining_time': '5m 28s'}
+Train:   7%|▋         | 40/576 [00:24<05:13,  1.71it/s]Train:   7%|▋         | 40/576 [00:24<05:13,  1.71it/s]Train:   7%|▋         | 41/576 [00:25<05:09,  1.73it/s]Train:   7%|▋         | 42/576 [00:25<05:00,  1.78it/s]Train:   7%|▋         | 43/576 [00:26<05:06,  1.74it/s]Train:   8%|▊         | 44/576 [00:26<04:50,  1.83it/s]Train:   8%|▊         | 45/576 [00:27<05:20,  1.66it/s]Train:   8%|▊         | 46/576 [00:27<05:13,  1.69it/s]Train:   8%|▊         | 47/576 [00:28<05:17,  1.66it/s]Train:   8%|▊         | 48/576 [00:29<05:12,  1.69it/s]Train:   9%|▊         | 49/576 [00:29<05:01,  1.75it/s]Train:   9%|▊         | 50/576 [00:30<05:00,  1.75it/s]                                                       {'loss': 1.15329294, 'token_acc': 0.69023519, 'grad_norm': 3.41821814, 'learning_rate': 1.971e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.109132, 'epoch': 0.09, 'global_step/max_steps': '50/576', 'percentage': '8.68%', 'elapsed_time': '30s', 'remaining_time': '5m 18s'}
+Train:   9%|▊         | 50/576 [00:30<05:00,  1.75it/s]Train:   9%|▊         | 50/576 [00:30<05:00,  1.75it/s]Train:   9%|▉         | 51/576 [00:30<04:44,  1.85it/s]Train:   9%|▉         | 52/576 [00:31<04:59,  1.75it/s]Train:   9%|▉         | 53/576 [00:31<04:51,  1.79it/s]Train:   9%|▉         | 54/576 [00:32<05:04,  1.71it/s]Train:  10%|▉         | 55/576 [00:32<04:44,  1.83it/s]Train:  10%|▉         | 56/576 [00:33<04:43,  1.84it/s]Train:  10%|▉         | 57/576 [00:34<04:35,  1.88it/s]Train:  10%|█         | 58/576 [00:34<04:59,  1.73it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.54it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.44it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.06it/s]                                                                                                         {'eval_loss': 1.11353922, 'eval_token_acc': 0.72224446, 'eval_runtime': 1.2902, 'eval_samples_per_second': 387.535, 'eval_steps_per_second': 4.65, 'epoch': 0.1, 'global_step/max_steps': '58/576', 'percentage': '10.07%', 'elapsed_time': '36s', 'remaining_time': '5m 21s'}
+Train:  10%|█         | 58/576 [00:36<04:59,  1.73it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.06it/s]Train:  10%|█         | 58/576 [00:36<04:59,  1.73it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.71it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-58
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  10%|█         | 59/576 [00:53<50:57,  5.91s/it]Train:  10%|█         | 60/576 [00:53<37:16,  4.33s/it]                                                       {'loss': 1.15097857, 'token_acc': 0.72460813, 'grad_norm': 2.61772633, 'learning_rate': 1.956e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 0.875111, 'epoch': 0.1, 'global_step/max_steps': '60/576', 'percentage': '10.42%', 'elapsed_time': '53s', 'remaining_time': '7m 42s'}
+Train:  10%|█         | 60/576 [00:53<37:16,  4.33s/it]Train:  10%|█         | 60/576 [00:53<37:16,  4.33s/it]Train:  11%|█         | 61/576 [00:54<27:35,  3.21s/it]Train:  11%|█         | 62/576 [00:54<20:51,  2.43s/it]Train:  11%|█         | 63/576 [00:55<16:00,  1.87s/it]Train:  11%|█         | 64/576 [00:56<12:32,  1.47s/it]Train:  11%|█▏        | 65/576 [00:56<10:23,  1.22s/it]Train:  11%|█▏        | 66/576 [00:57<08:51,  1.04s/it]Train:  12%|█▏        | 67/576 [00:57<07:24,  1.15it/s]Train:  12%|█▏        | 68/576 [00:58<06:28,  1.31it/s]Train:  12%|█▏        | 69/576 [00:58<05:48,  1.46it/s]Train:  12%|█▏        | 70/576 [00:59<05:15,  1.60it/s]                                                       {'loss': 1.14875164, 'token_acc': 0.7077306, 'grad_norm': 3.16013718, 'learning_rate': 1.938e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 0.94457, 'epoch': 0.12, 'global_step/max_steps': '70/576', 'percentage': '12.15%', 'elapsed_time': '59s', 'remaining_time': '7m 8s'}
+Train:  12%|█▏        | 70/576 [00:59<05:15,  1.60it/s]Train:  12%|█▏        | 70/576 [00:59<05:15,  1.60it/s]Train:  12%|█▏        | 71/576 [00:59<05:02,  1.67it/s]Train:  12%|█▎        | 72/576 [01:00<04:52,  1.72it/s]Train:  13%|█▎        | 73/576 [01:00<04:48,  1.75it/s]Train:  13%|█▎        | 74/576 [01:01<04:35,  1.82it/s]Train:  13%|█▎        | 75/576 [01:02<04:49,  1.73it/s]Train:  13%|█▎        | 76/576 [01:02<04:27,  1.87it/s]Train:  13%|█▎        | 77/576 [01:02<04:19,  1.93it/s]Train:  14%|█▎        | 78/576 [01:03<04:25,  1.87it/s]Train:  14%|█▎        | 79/576 [01:04<04:39,  1.78it/s]Train:  14%|█▍        | 80/576 [01:04<04:35,  1.80it/s]                                                       {'loss': 1.13016357, 'token_acc': 0.70583161, 'grad_norm': 2.71143651, 'learning_rate': 1.918e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.005873, 'epoch': 0.14, 'global_step/max_steps': '80/576', 'percentage': '13.89%', 'elapsed_time': '1m 4s', 'remaining_time': '6m 41s'}
+Train:  14%|█▍        | 80/576 [01:04<04:35,  1.80it/s]Train:  14%|█▍        | 80/576 [01:04<04:35,  1.80it/s]Train:  14%|█▍        | 81/576 [01:05<04:25,  1.87it/s]Train:  14%|█▍        | 82/576 [01:05<04:30,  1.82it/s]Train:  14%|█▍        | 83/576 [01:06<04:29,  1.83it/s]Train:  15%|█▍        | 84/576 [01:07<04:48,  1.70it/s]Train:  15%|█▍        | 85/576 [01:07<04:36,  1.78it/s]Train:  15%|█▍        | 86/576 [01:08<04:30,  1.81it/s]Train:  15%|█▌        | 87/576 [01:08<04:34,  1.78it/s]Train:  15%|█▌        | 88/576 [01:09<04:33,  1.78it/s]Train:  15%|█▌        | 89/576 [01:09<04:34,  1.77it/s]Train:  16%|█▌        | 90/576 [01:10<04:24,  1.84it/s]                                                       Train:  16%|█▌        | 90/576 [01:10<04:24,  1.84it/s]{'loss': 1.11112213, 'token_acc': 0.70596062, 'grad_norm': 2.61051154, 'learning_rate': 1.895e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.057962, 'epoch': 0.16, 'global_step/max_steps': '90/576', 'percentage': '15.62%', 'elapsed_time': '1m 10s', 'remaining_time': '6m 19s'}
+Train:  16%|█▌        | 90/576 [01:10<04:24,  1.84it/s]Train:  16%|█▌        | 91/576 [01:10<04:21,  1.85it/s]Train:  16%|█▌        | 92/576 [01:11<04:06,  1.96it/s]Train:  16%|█▌        | 93/576 [01:11<04:03,  1.98it/s]Train:  16%|█▋        | 94/576 [01:12<04:26,  1.81it/s]Train:  16%|█▋        | 95/576 [01:12<04:33,  1.76it/s]Train:  17%|█▋        | 96/576 [01:13<04:42,  1.70it/s]Train:  17%|█▋        | 97/576 [01:14<04:42,  1.70it/s]Train:  17%|█▋        | 98/576 [01:14<04:26,  1.79it/s]Train:  17%|█▋        | 99/576 [01:15<04:21,  1.83it/s]Train:  17%|█▋        | 100/576 [01:15<04:01,  1.97it/s]                                                        {'loss': 1.11460667, 'token_acc': 0.71566265, 'grad_norm': 2.54188967, 'learning_rate': 1.869e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.105587, 'epoch': 0.17, 'global_step/max_steps': '100/576', 'percentage': '17.36%', 'elapsed_time': '1m 15s', 'remaining_time': '5m 59s'}
+Train:  17%|█▋        | 100/576 [01:15<04:01,  1.97it/s]Train:  17%|█▋        | 100/576 [01:15<04:01,  1.97it/s]Train:  18%|█▊        | 101/576 [01:16<04:04,  1.94it/s]Train:  18%|█▊        | 102/576 [01:16<04:14,  1.86it/s]Train:  18%|█▊        | 103/576 [01:17<04:33,  1.73it/s]Train:  18%|█▊        | 104/576 [01:17<04:32,  1.73it/s]Train:  18%|█▊        | 105/576 [01:18<04:50,  1.62it/s]Train:  18%|█▊        | 106/576 [01:19<04:49,  1.62it/s]Train:  19%|█▊        | 107/576 [01:19<04:46,  1.64it/s]Train:  19%|█▉        | 108/576 [01:20<04:40,  1.67it/s]Train:  19%|█▉        | 109/576 [01:20<04:15,  1.82it/s]Train:  19%|█▉        | 110/576 [01:21<04:19,  1.80it/s]                                                        {'loss': 1.10196409, 'token_acc': 0.70481559, 'grad_norm': 2.31592703, 'learning_rate': 1.84e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.142044, 'epoch': 0.19, 'global_step/max_steps': '110/576', 'percentage': '19.10%', 'elapsed_time': '1m 21s', 'remaining_time': '5m 45s'}
+Train:  19%|█▉        | 110/576 [01:21<04:19,  1.80it/s]Train:  19%|█▉        | 110/576 [01:21<04:19,  1.80it/s]Train:  19%|█▉        | 111/576 [01:22<04:46,  1.62it/s]Train:  19%|█▉        | 112/576 [01:22<04:39,  1.66it/s]Train:  20%|█▉        | 113/576 [01:23<04:18,  1.79it/s]Train:  20%|█▉        | 114/576 [01:23<04:09,  1.85it/s]Train:  20%|█▉        | 115/576 [01:24<04:08,  1.85it/s]Train:  20%|██        | 116/576 [01:24<04:21,  1.76it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.61it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.43it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.85it/s]                                                                                                          {'eval_loss': 1.05883944, 'eval_token_acc': 0.73568765, 'eval_runtime': 1.3021, 'eval_samples_per_second': 383.99, 'eval_steps_per_second': 4.608, 'epoch': 0.2, 'global_step/max_steps': '116/576', 'percentage': '20.14%', 'elapsed_time': '1m 26s', 'remaining_time': '5m 42s'}
+Train:  20%|██        | 116/576 [01:26<04:21,  1.76it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.85it/s]Train:  20%|██        | 116/576 [01:26<04:21,  1.76it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.58it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-116
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  20%|██        | 117/576 [01:42<44:10,  5.77s/it]Train:  20%|██        | 118/576 [01:43<32:16,  4.23s/it]Train:  21%|██        | 119/576 [01:44<23:51,  3.13s/it]Train:  21%|██        | 120/576 [01:44<17:48,  2.34s/it]                                                        {'loss': 1.10753803, 'token_acc': 0.7252096, 'grad_norm': 2.30762529, 'learning_rate': 1.809e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.005077, 'epoch': 0.21, 'global_step/max_steps': '120/576', 'percentage': '20.83%', 'elapsed_time': '1m 44s', 'remaining_time': '6m 37s'}
+Train:  21%|██        | 120/576 [01:44<17:48,  2.34s/it]Train:  21%|██        | 120/576 [01:44<17:48,  2.34s/it]Train:  21%|██        | 121/576 [01:45<13:54,  1.84s/it]Train:  21%|██        | 122/576 [01:45<11:08,  1.47s/it]Train:  21%|██▏       | 123/576 [01:46<08:58,  1.19s/it]Train:  22%|██▏       | 124/576 [01:46<07:29,  1.01it/s]Train:  22%|██▏       | 125/576 [01:47<06:34,  1.14it/s]Train:  22%|██▏       | 126/576 [01:48<05:46,  1.30it/s]Train:  22%|██▏       | 127/576 [01:48<05:06,  1.46it/s]Train:  22%|██▏       | 128/576 [01:49<05:10,  1.44it/s]Train:  22%|██▏       | 129/576 [01:49<04:43,  1.57it/s]Train:  23%|██▎       | 130/576 [01:50<04:28,  1.66it/s]                                                        {'loss': 1.11092796, 'token_acc': 0.71568627, 'grad_norm': 2.74145222, 'learning_rate': 1.775e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.039335, 'epoch': 0.23, 'global_step/max_steps': '130/576', 'percentage': '22.57%', 'elapsed_time': '1m 50s', 'remaining_time': '6m 18s'}
+Train:  23%|██▎       | 130/576 [01:50<04:28,  1.66it/s]Train:  23%|██▎       | 130/576 [01:50<04:28,  1.66it/s]Train:  23%|██▎       | 131/576 [01:50<04:18,  1.72it/s]Train:  23%|██▎       | 132/576 [01:51<04:43,  1.56it/s]Train:  23%|██▎       | 133/576 [01:52<04:29,  1.64it/s]Train:  23%|██▎       | 134/576 [01:52<04:25,  1.66it/s]Train:  23%|██▎       | 135/576 [01:53<04:06,  1.79it/s]Train:  24%|██▎       | 136/576 [01:53<04:13,  1.74it/s]Train:  24%|██▍       | 137/576 [01:54<04:05,  1.79it/s]Train:  24%|██▍       | 138/576 [01:54<04:12,  1.73it/s]Train:  24%|██▍       | 139/576 [01:55<04:05,  1.78it/s]Train:  24%|██▍       | 140/576 [01:56<04:07,  1.76it/s]                                                        {'loss': 1.05044975, 'token_acc': 0.73300768, 'grad_norm': 2.65658998, 'learning_rate': 1.739e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.070109, 'epoch': 0.24, 'global_step/max_steps': '140/576', 'percentage': '24.31%', 'elapsed_time': '1m 56s', 'remaining_time': '6m 1s'}
+Train:  24%|██▍       | 140/576 [01:56<04:07,  1.76it/s]Train:  24%|██▍       | 140/576 [01:56<04:07,  1.76it/s]Train:  24%|██▍       | 141/576 [01:56<04:05,  1.77it/s]Train:  25%|██▍       | 142/576 [01:57<03:58,  1.82it/s]Train:  25%|██▍       | 143/576 [01:57<03:50,  1.88it/s]Train:  25%|██▌       | 144/576 [01:58<04:10,  1.72it/s]Train:  25%|██▌       | 145/576 [01:58<04:07,  1.74it/s]Train:  25%|██▌       | 146/576 [01:59<04:15,  1.68it/s]Train:  26%|██��       | 147/576 [01:59<03:59,  1.79it/s]Train:  26%|██▌       | 148/576 [02:00<04:25,  1.61it/s]Train:  26%|██▌       | 149/576 [02:01<04:14,  1.68it/s]Train:  26%|██▌       | 150/576 [02:01<04:10,  1.70it/s]                                                        {'loss': 1.06343126, 'token_acc': 0.72577078, 'grad_norm': 2.43761706, 'learning_rate': 1.701e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.097821, 'epoch': 0.26, 'global_step/max_steps': '150/576', 'percentage': '26.04%', 'elapsed_time': '2m 1s', 'remaining_time': '5m 45s'}
+Train:  26%|██▌       | 150/576 [02:01<04:10,  1.70it/s]Train:  26%|██▌       | 150/576 [02:01<04:10,  1.70it/s]Train:  26%|██▌       | 151/576 [02:02<04:10,  1.70it/s]Train:  26%|██▋       | 152/576 [02:03<04:13,  1.67it/s]Train:  27%|██▋       | 153/576 [02:03<04:05,  1.73it/s]Train:  27%|██▋       | 154/576 [02:04<04:18,  1.63it/s]Train:  27%|██▋       | 155/576 [02:04<04:12,  1.67it/s]Train:  27%|██▋       | 156/576 [02:05<04:08,  1.69it/s]Train:  27%|██▋       | 157/576 [02:05<03:58,  1.76it/s]Train:  27%|██▋       | 158/576 [02:06<03:47,  1.84it/s]Train:  28%|██▊       | 159/576 [02:07<03:55,  1.77it/s]Train:  28%|██▊       | 160/576 [02:07<04:03,  1.71it/s]                                                        {'loss': 1.0687252, 'token_acc': 0.73549063, 'grad_norm': 2.07366014, 'learning_rate': 1.661e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.123085, 'epoch': 0.28, 'global_step/max_steps': '160/576', 'percentage': '27.78%', 'elapsed_time': '2m 7s', 'remaining_time': '5m 31s'}
+Train:  28%|██▊       | 160/576 [02:07<04:03,  1.71it/s]Train:  28%|██▊       | 160/576 [02:07<04:03,  1.71it/s]Train:  28%|██▊       | 161/576 [02:08<04:02,  1.71it/s]Train:  28%|██▊       | 162/576 [02:08<04:05,  1.68it/s]Train:  28%|██▊       | 163/576 [02:09<04:10,  1.65it/s]Train:  28%|██▊       | 164/576 [02:10<04:05,  1.68it/s]Train:  29%|██▊       | 165/576 [02:10<03:49,  1.79it/s]Train:  29%|██▉       | 166/576 [02:11<04:14,  1.61it/s]Train:  29%|██▉       | 167/576 [02:11<04:16,  1.59it/s]Train:  29%|██▉       | 168/576 [02:12<04:04,  1.67it/s]Train:  29%|██▉       | 169/576 [02:13<04:24,  1.54it/s]Train:  30%|██▉       | 170/576 [02:13<04:16,  1.58it/s]                                                        {'loss': 1.05881166, 'token_acc': 0.72149436, 'grad_norm': 2.58424926, 'learning_rate': 1.619e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.143713, 'epoch': 0.3, 'global_step/max_steps': '170/576', 'percentage': '29.51%', 'elapsed_time': '2m 13s', 'remaining_time': '5m 19s'}
+Train:  30%|██▉       | 170/576 [02:13<04:16,  1.58it/s]Train:  30%|██▉       | 170/576 [02:13<04:16,  1.58it/s]Train:  30%|██▉       | 171/576 [02:14<04:03,  1.66it/s]Train:  30%|██▉       | 172/576 [02:14<03:59,  1.69it/s]Train:  30%|███       | 173/576 [02:15<04:01,  1.67it/s]Train:  30%|███       | 174/576 [02:16<04:06,  1.63it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.59it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.43it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.79it/s]                                                                                                          {'eval_loss': 1.00839376, 'eval_token_acc': 0.74685518, 'eval_runtime': 1.309, 'eval_samples_per_second': 381.957, 'eval_steps_per_second': 4.583, 'epoch': 0.3, 'global_step/max_steps': '174/576', 'percentage': '30.21%', 'elapsed_time': '2m 17s', 'remaining_time': '5m 17s'}
+Train:  30%|███       | 174/576 [02:17<04:06,  1.63it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.79it/s]Train:  30%|███       | 174/576 [02:17<04:06,  1.63it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.50it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-174
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  30%|███       | 175/576 [02:34<38:57,  5.83s/it]Train:  31%|███       | 176/576 [02:34<28:22,  4.26s/it]Train:  31%|███       | 177/576 [02:35<20:41,  3.11s/it]Train:  31%|███       | 178/576 [02:35<15:29,  2.33s/it]Train:  31%|███       | 179/576 [02:36<12:05,  1.83s/it]Train:  31%|███▏      | 180/576 [02:36<09:26,  1.43s/it]                                                        {'loss': 1.04887629, 'token_acc': 0.72400621, 'grad_norm': 2.55758142, 'learning_rate': 1.574e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.048375, 'epoch': 0.31, 'global_step/max_steps': '180/576', 'percentage': '31.25%', 'elapsed_time': '2m 36s', 'remaining_time': '5m 45s'}
+Train:  31%|███▏      | 180/576 [02:36<09:26,  1.43s/it]Train:  31%|███▏      | 180/576 [02:36<09:26,  1.43s/it]Train:  31%|███▏      | 181/576 [02:37<07:34,  1.15s/it]Train:  32%|███▏      | 182/576 [02:37<06:25,  1.02it/s]Train:  32%|███▏      | 183/576 [02:38<05:44,  1.14it/s]Train:  32%|███▏      | 184/576 [02:39<05:03,  1.29it/s]Train:  32%|███▏      | 185/576 [02:39<04:31,  1.44it/s]Train:  32%|███▏      | 186/576 [02:40<04:29,  1.45it/s]Train:  32%|███▏      | 187/576 [02:40<04:09,  1.56it/s]Train:  33%|███▎      | 188/576 [02:41<04:05,  1.58it/s]Train:  33%|███▎      | 189/576 [02:42<03:56,  1.64it/s]Train:  33%|███▎      | 190/576 [02:42<04:00,  1.60it/s]                                                        {'loss': 1.06350975, 'token_acc': 0.74087176, 'grad_norm': 2.02499986, 'learning_rate': 1.528e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.070528, 'epoch': 0.33, 'global_step/max_steps': '190/576', 'percentage': '32.99%', 'elapsed_time': '2m 42s', 'remaining_time': '5m 30s'}
+Train:  33%|███▎      | 190/576 [02:42<04:00,  1.60it/s]Train:  33%|███▎      | 190/576 [02:42<04:00,  1.60it/s]Train:  33%|███▎      | 191/576 [02:43<03:49,  1.67it/s]Train:  33%|███▎      | 192/576 [02:43<03:39,  1.75it/s]Train:  34%|███▎      | 193/576 [02:44<03:30,  1.82it/s]Train:  34%|███▎      | 194/576 [02:44<03:37,  1.76it/s]Train:  34%|███▍      | 195/576 [02:45<03:46,  1.68it/s]Train:  34%|███▍      | 196/576 [02:45<03:32,  1.79it/s]Train:  34%|███▍      | 197/576 [02:46<03:47,  1.67it/s]Train:  34%|███▍      | 198/576 [02:47<04:05,  1.54it/s]Train:  35%|███▍      | 199/576 [02:47<03:52,  1.62it/s]Train:  35%|███▍      | 200/576 [02:48<03:41,  1.70it/s]                                                        {'loss': 1.02220974, 'token_acc': 0.75109617, 'grad_norm': 2.30510831, 'learning_rate': 1.481e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.091129, 'epoch': 0.35, 'global_step/max_steps': '200/576', 'percentage': '34.72%', 'elapsed_time': '2m 48s', 'remaining_time': '5m 16s'}
+Train:  35%|███▍      | 200/576 [02:48<03:41,  1.70it/s]Train:  35%|███▍      | 200/576 [02:48<03:41,  1.70it/s]Train:  35%|███▍      | 201/576 [02:49<03:33,  1.75it/s]Train:  35%|███▌      | 202/576 [02:49<03:28,  1.79it/s]Train:  35%|███▌      | 203/576 [02:49<03:16,  1.89it/s]Train:  35%|███▌      | 204/576 [02:50<03:23,  1.83it/s]Train:  36%|███▌      | 205/576 [02:51<03:24,  1.82it/s]Train:  36%|███▌      | 206/576 [02:51<03:48,  1.62it/s]Train:  36%|███▌      | 207/576 [02:52<03:37,  1.70it/s]Train:  36%|███▌      | 208/576 [02:53<03:34,  1.71it/s]Train:  36%|███▋      | 209/576 [02:53<03:31,  1.73it/s]Train:  36%|███▋      | 210/576 [02:54<03:20,  1.83it/s]                                                        {'loss': 1.05668774, 'token_acc': 0.75872353, 'grad_norm': 2.54755783, 'learning_rate': 1.432e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.111894, 'epoch': 0.36, 'global_step/max_steps': '210/576', 'percentage': '36.46%', 'elapsed_time': '2m 54s', 'remaining_time': '5m 3s'}
+Train:  36%|███▋      | 210/576 [02:54<03:20,  1.83it/s]Train:  36%|███▋      | 210/576 [02:54<03:20,  1.83it/s]Train:  37%|███▋      | 211/576 [02:54<03:23,  1.80it/s]Train:  37%|███▋      | 212/576 [02:55<03:23,  1.79it/s]Train:  37%|███▋      | 213/576 [02:55<03:19,  1.82it/s]Train:  37%|███▋      | 214/576 [02:56<03:20,  1.81it/s]Train:  37%|███▋      | 215/576 [02:56<03:09,  1.90it/s]Train:  38%|███▊      | 216/576 [02:57<03:07,  1.91it/s]Train:  38%|███▊      | 217/576 [02:57<03:02,  1.97it/s]Train:  38%|███▊      | 218/576 [02:58<03:16,  1.83it/s]Train:  38%|███▊      | 219/576 [02:59<03:26,  1.73it/s]Train:  38%|███▊      | 220/576 [02:59<03:20,  1.77it/s]                                                        {'loss': 1.05190687, 'token_acc': 0.72497027, 'grad_norm': 2.52121735, 'learning_rate': 1.381e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.131854, 'epoch': 0.38, 'global_step/max_steps': '220/576', 'percentage': '38.19%', 'elapsed_time': '2m 59s', 'remaining_time': '4m 50s'}
+Train:  38%|███▊      | 220/576 [02:59<03:20,  1.77it/s]Train:  38%|███▊      | 220/576 [02:59<03:20,  1.77it/s]Train:  38%|███▊      | 221/576 [03:00<03:34,  1.65it/s]Train:  39%|███▊      | 222/576 [03:00<03:25,  1.72it/s]Train:  39%|███▊      | 223/576 [03:01<03:24,  1.72it/s]Train:  39%|███▉      | 224/576 [03:02<03:32,  1.66it/s]Train:  39%|███▉      | 225/576 [03:02<03:28,  1.68it/s]Train:  39%|███▉      | 226/576 [03:03<03:20,  1.74it/s]Train:  39%|███▉      | 227/576 [03:03<03:15,  1.79it/s]Train:  40%|███▉      | 228/576 [03:04<03:04,  1.89it/s]Train:  40%|███▉      | 229/576 [03:04<03:17,  1.76it/s]Train:  40%|███▉      | 230/576 [03:05<03:21,  1.71it/s]                                                        {'loss': 1.05094137, 'token_acc': 0.72964035, 'grad_norm': 2.57324004, 'learning_rate': 1.33e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.148874, 'epoch': 0.4, 'global_step/max_steps': '230/576', 'percentage': '39.93%', 'elapsed_time': '3m 5s', 'remaining_time': '4m 38s'}
+Train:  40%|███▉      | 230/576 [03:05<03:21,  1.71it/s]Train:  40%|███▉      | 230/576 [03:05<03:21,  1.71it/s]Train:  40%|████      | 231/576 [03:05<03:21,  1.71it/s]Train:  40%|████      | 232/576 [03:06<03:32,  1.62it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.63it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.47it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.73it/s]                                                                                                          {'eval_loss': 0.9733482, 'eval_token_acc': 0.75614741, 'eval_runtime': 1.3164, 'eval_samples_per_second': 379.814, 'eval_steps_per_second': 4.558, 'epoch': 0.4, 'global_step/max_steps': '232/576', 'percentage': '40.28%', 'elapsed_time': '3m 7s', 'remaining_time': '4m 38s'}
+Train:  40%|████      | 232/576 [03:07<03:32,  1.62it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.73it/s]Train:  40%|████      | 232/576 [03:07<03:32,  1.62it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.47it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-232
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  40%|████      | 233/576 [03:25<35:19,  6.18s/it]Train:  41%|████      | 234/576 [03:26<25:45,  4.52s/it]Train:  41%|████      | 235/576 [03:27<19:08,  3.37s/it]Train:  41%|████      | 236/576 [03:27<14:26,  2.55s/it]Train:  41%|████      | 237/576 [03:28<11:01,  1.95s/it]Train:  41%|████▏     | 238/576 [03:29<08:52,  1.58s/it]Train:  41%|████▏     | 239/576 [03:29<07:00,  1.25s/it]Train:  42%|████▏     | 240/576 [03:30<05:46,  1.03s/it]                                                        {'loss': 0.97692032, 'token_acc': 0.75384225, 'grad_norm': 2.46891141, 'learning_rate': 1.277e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.067325, 'epoch': 0.42, 'global_step/max_steps': '240/576', 'percentage': '41.67%', 'elapsed_time': '3m 30s', 'remaining_time': '4m 54s'}
+Train:  42%|████▏     | 240/576 [03:30<05:46,  1.03s/it]Train:  42%|████▏     | 240/576 [03:30<05:46,  1.03s/it]Train:  42%|████▏     | 241/576 [03:30<05:07,  1.09it/s]Train:  42%|████▏     | 242/576 [03:31<04:37,  1.20it/s]Train:  42%|████▏     | 243/576 [03:31<04:07,  1.35it/s]Train:  42%|████▏     | 244/576 [03:32<03:54,  1.42it/s]Train:  43%|████▎     | 245/576 [03:33<03:40,  1.50it/s]Train:  43%|████▎     | 246/576 [03:33<03:21,  1.63it/s]Train:  43%|████▎     | 247/576 [03:34<03:06,  1.76it/s]Train:  43%|████▎     | 248/576 [03:34<03:01,  1.81it/s]Train:  43%|████▎     | 249/576 [03:35<03:11,  1.71it/s]Train:  43%|████▎     | 250/576 [03:35<03:13,  1.69it/s]                                                        {'loss': 1.00677357, 'token_acc': 0.75623202, 'grad_norm': 2.23032236, 'learning_rate': 1.224e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.084058, 'epoch': 0.43, 'global_step/max_steps': '250/576', 'percentage': '43.40%', 'elapsed_time': '3m 35s', 'remaining_time': '4m 41s'}
+Train:  43%|████▎     | 250/576 [03:35<03:13,  1.69it/s]Train:  43%|████▎     | 250/576 [03:35<03:13,  1.69it/s]Train:  44%|████▎     | 251/576 [03:36<03:16,  1.66it/s]Train:  44%|████▍     | 252/576 [03:37<03:22,  1.60it/s]Train:  44%|████▍     | 253/576 [03:37<03:15,  1.65it/s]Train:  44%|████▍     | 254/576 [03:38<03:04,  1.74it/s]Train:  44%|████▍     | 255/576 [03:38<02:51,  1.88it/s]Train:  44%|████▍     | 256/576 [03:39<02:54,  1.83it/s]Train:  45%|████▍     | 257/576 [03:39<03:05,  1.72it/s]Train:  45%|████▍     | 258/576 [03:40<03:04,  1.72it/s]Train:  45%|████▍     | 259/576 [03:40<03:03,  1.73it/s]Train:  45%|████▌     | 260/576 [03:41<03:03,  1.72it/s]                                                        {'loss': 1.01661434, 'token_acc': 0.74765985, 'grad_norm': 2.45739126, 'learning_rate': 1.17e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.09986, 'epoch': 0.45, 'global_step/max_steps': '260/576', 'percentage': '45.14%', 'elapsed_time': '3m 41s', 'remaining_time': '4m 29s'}
+Train:  45%|████▌     | 260/576 [03:41<03:03,  1.72it/s]Train:  45%|████▌     | 260/576 [03:41<03:03,  1.72it/s]Train:  45%|████▌     | 261/576 [03:42<03:04,  1.71it/s]Train:  45%|████▌     | 262/576 [03:42<03:03,  1.71it/s]Train:  46%|████▌     | 263/576 [03:43<03:05,  1.68it/s]Train:  46%|████▌     | 264/576 [03:43<02:54,  1.79it/s]Train:  46%|████▌     | 265/576 [03:44<03:13,  1.61it/s]Train:  46%|████▌     | 266/576 [03:45<03:00,  1.71it/s]Train:  46%|████▋     | 267/576 [03:45<02:55,  1.76it/s]Train:  47%|████▋     | 268/576 [03:46<02:55,  1.76it/s]Train:  47%|████▋     | 269/576 [03:46<02:54,  1.76it/s]Train:  47%|████▋     | 270/576 [03:47<03:01,  1.68it/s]                                                        {'loss': 0.9885828, 'token_acc': 0.74178666, 'grad_norm': 1.99874353, 'learning_rate': 1.115e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.114544, 'epoch': 0.47, 'global_step/max_steps': '270/576', 'percentage': '46.88%', 'elapsed_time': '3m 47s', 'remaining_time': '4m 17s'}
+Train:  47%|████▋     | 270/576 [03:47<03:01,  1.68it/s]Train:  47%|████▋     | 270/576 [03:47<03:01,  1.68it/s]Train:  47%|████▋     | 271/576 [03:47<02:56,  1.73it/s]Train:  47%|████▋     | 272/576 [03:48<02:53,  1.75it/s]Train:  47%|████▋     | 273/576 [03:49<02:51,  1.77it/s]Train:  48%|████▊     | 274/576 [03:49<02:45,  1.82it/s]Train:  48%|████▊     | 275/576 [03:50<03:05,  1.63it/s]Train:  48%|████▊     | 276/576 [03:50<02:55,  1.70it/s]Train:  48%|████▊     | 277/576 [03:51<03:05,  1.62it/s]Train:  48%|████▊     | 278/576 [03:52<02:56,  1.69it/s]Train:  48%|████▊     | 279/576 [03:52<02:57,  1.67it/s]Train:  49%|████▊     | 280/576 [03:53<02:54,  1.69it/s]                                                        {'loss': 1.02517633, 'token_acc': 0.74498021, 'grad_norm': 2.3875699, 'learning_rate': 1.061e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.128527, 'epoch': 0.49, 'global_step/max_steps': '280/576', 'percentage': '48.61%', 'elapsed_time': '3m 53s', 'remaining_time': '4m 6s'}
+Train:  49%|████▊     | 280/576 [03:53<02:54,  1.69it/s]Train:  49%|████▊     | 280/576 [03:53<02:54,  1.69it/s]Train:  49%|████▉     | 281/576 [03:53<02:46,  1.77it/s]Train:  49%|████▉     | 282/576 [03:54<02:50,  1.73it/s]Train:  49%|████▉     | 283/576 [03:55<03:00,  1.63it/s]Train:  49%|████▉     | 284/576 [03:55<03:01,  1.61it/s]Train:  49%|████▉     | 285/576 [03:56<02:50,  1.71it/s]Train:  50%|████▉     | 286/576 [03:56<02:44,  1.76it/s]Train:  50%|████▉     | 287/576 [03:57<02:42,  1.77it/s]Train:  50%|█████     | 288/576 [03:57<02:43,  1.76it/s]Train:  50%|█████     | 289/576 [03:58<02:51,  1.67it/s]Train:  50%|█████     | 290/576 [03:59<02:45,  1.73it/s]                                                        {'loss': 0.99593086, 'token_acc': 0.75641026, 'grad_norm': 2.25379562, 'learning_rate': 1.006e-05, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.142079, 'epoch': 0.5, 'global_step/max_steps': '290/576', 'percentage': '50.35%', 'elapsed_time': '3m 59s', 'remaining_time': '3m 55s'}
+Train:  50%|█████     | 290/576 [03:59<02:45,  1.73it/s]Train:  50%|█████     | 290/576 [03:59<02:45,  1.73it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.59it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.45it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.82it/s]                                                                                                          {'eval_loss': 0.93815315, 'eval_token_acc': 0.76282686, 'eval_runtime': 1.3133, 'eval_samples_per_second': 380.716, 'eval_steps_per_second': 4.569, 'epoch': 0.5, 'global_step/max_steps': '290/576', 'percentage': '50.35%', 'elapsed_time': '4m 0s', 'remaining_time': '3m 57s'}
+Train:  50%|█████     | 290/576 [04:00<02:45,  1.73it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.82it/s]Train:  50%|█████     | 290/576 [04:00<02:45,  1.73it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.55it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-290
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  51%|█████     | 291/576 [04:18<28:57,  6.10s/it]Train:  51%|█████     | 292/576 [04:18<20:59,  4.43s/it]Train:  51%|█████     | 293/576 [04:19<15:27,  3.28s/it]Train:  51%|█████     | 294/576 [04:19<11:37,  2.47s/it]Train:  51%|█████     | 295/576 [04:20<08:50,  1.89s/it]Train:  51%|█████▏    | 296/576 [04:21<07:08,  1.53s/it]Train:  52%|█████▏    | 297/576 [04:21<05:56,  1.28s/it]Train:  52%|█████▏    | 298/576 [04:22<04:56,  1.07s/it]Train:  52%|█████▏    | 299/576 [04:22<04:25,  1.04it/s]Train:  52%|█████▏    | 300/576 [04:23<03:48,  1.21it/s]                                                        {'loss': 0.99728003, 'token_acc': 0.76250388, 'grad_norm': 2.56651878, 'learning_rate': 9.5e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.077855, 'epoch': 0.52, 'global_step/max_steps': '300/576', 'percentage': '52.08%', 'elapsed_time': '4m 23s', 'remaining_time': '4m 2s'}
+Train:  52%|█████▏    | 300/576 [04:23<03:48,  1.21it/s]Train:  52%|█████▏    | 300/576 [04:23<03:48,  1.21it/s]Train:  52%|█████▏    | 301/576 [04:24<03:24,  1.34it/s]Train:  52%|█████▏    | 302/576 [04:24<03:22,  1.35it/s]Train:  53%|█████▎    | 303/576 [04:25<03:09,  1.44it/s]Train:  53%|█████▎    | 304/576 [04:25<02:53,  1.57it/s]Train:  53%|█████▎    | 305/576 [04:26<02:44,  1.65it/s]Train:  53%|█████▎    | 306/576 [04:26<02:35,  1.73it/s]Train:  53%|█████▎    | 307/576 [04:27<02:34,  1.74it/s]Train:  53%|█████▎    | 308/576 [04:28<02:39,  1.69it/s]Train:  54%|█████▎    | 309/576 [04:28<02:40,  1.67it/s]Train:  54%|█████▍    | 310/576 [04:29<02:33,  1.73it/s]                                                        {'loss': 0.96963453, 'token_acc': 0.76961512, 'grad_norm': 2.67119718, 'learning_rate': 8.95e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.091167, 'epoch': 0.54, 'global_step/max_steps': '310/576', 'percentage': '53.82%', 'elapsed_time': '4m 29s', 'remaining_time': '3m 51s'}
+Train:  54%|█████▍    | 310/576 [04:29<02:33,  1.73it/s]Train:  54%|█████▍    | 310/576 [04:29<02:33,  1.73it/s]Train:  54%|█████▍    | 311/576 [04:29<02:29,  1.77it/s]Train:  54%|█████▍    | 312/576 [04:30<02:26,  1.80it/s]Train:  54%|█████▍    | 313/576 [04:30<02:28,  1.77it/s]Train:  55%|█████▍    | 314/576 [04:31<02:27,  1.78it/s]Train:  55%|█████▍    | 315/576 [04:32<02:28,  1.75it/s]Train:  55%|█████▍    | 316/576 [04:32<02:21,  1.84it/s]Train:  55%|█████▌    | 317/576 [04:33<02:23,  1.80it/s]Train:  55%|█████▌    | 318/576 [04:33<02:24,  1.79it/s]Train:  55%|█████▌    | 319/576 [04:34<02:33,  1.68it/s]Train:  56%|█████▌    | 320/576 [04:34<02:27,  1.73it/s]                                                        {'loss': 1.01511145, 'token_acc': 0.73065327, 'grad_norm': 2.56703401, 'learning_rate': 8.41e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.104419, 'epoch': 0.56, 'global_step/max_steps': '320/576', 'percentage': '55.56%', 'elapsed_time': '4m 34s', 'remaining_time': '3m 39s'}
+Train:  56%|█████▌    | 320/576 [04:34<02:27,  1.73it/s]Train:  56%|█████▌    | 320/576 [04:34<02:27,  1.73it/s]Train:  56%|█████▌    | 321/576 [04:35<02:24,  1.77it/s]Train:  56%|█████▌    | 322/576 [04:35<02:20,  1.81it/s]Train:  56%|█████▌    | 323/576 [04:36<02:14,  1.89it/s]Train:  56%|█████▋    | 324/576 [04:37<02:16,  1.85it/s]Train:  56%|█████▋    | 325/576 [04:37<02:11,  1.91it/s]Train:  57%|█████▋    | 326/576 [04:38<02:11,  1.90it/s]Train:  57%|█████▋    | 327/576 [04:38<02:08,  1.95it/s]Train:  57%|█████▋    | 328/576 [04:39<02:08,  1.92it/s]Train:  57%|█████▋    | 329/576 [04:39<02:12,  1.87it/s]Train:  57%|█████▋    | 330/576 [04:40<02:15,  1.82it/s]                                                        {'loss': 0.91283436, 'token_acc': 0.76599416, 'grad_norm': 2.33123398, 'learning_rate': 7.87e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.118493, 'epoch': 0.57, 'global_step/max_steps': '330/576', 'percentage': '57.29%', 'elapsed_time': '4m 40s', 'remaining_time': '3m 28s'}
+Train:  57%|█████▋    | 330/576 [04:40<02:15,  1.82it/s]Train:  57%|█████▋    | 330/576 [04:40<02:15,  1.82it/s]Train:  57%|█████▋    | 331/576 [04:40<02:12,  1.84it/s]Train:  58%|█████▊    | 332/576 [04:41<02:07,  1.91it/s]Train:  58%|█████▊    | 333/576 [04:41<02:19,  1.74it/s]Train:  58%|█████▊    | 334/576 [04:42<02:11,  1.83it/s]Train:  58%|█████▊    | 335/576 [04:42<02:13,  1.80it/s]Train:  58%|█████▊    | 336/576 [04:43<02:14,  1.79it/s]Train:  59%|█████▊    | 337/576 [04:44<02:21,  1.69it/s]Train:  59%|█████▊    | 338/576 [04:44<02:15,  1.76it/s]Train:  59%|█████▉    | 339/576 [04:45<02:09,  1.83it/s]Train:  59%|█████▉    | 340/576 [04:45<02:13,  1.77it/s]                                                        {'loss': 0.94972649, 'token_acc': 0.76425007, 'grad_norm': 2.17938685, 'learning_rate': 7.33e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.130908, 'epoch': 0.59, 'global_step/max_steps': '340/576', 'percentage': '59.03%', 'elapsed_time': '4m 45s', 'remaining_time': '3m 18s'}
+Train:  59%|█████▉    | 340/576 [04:45<02:13,  1.77it/s]Train:  59%|█████▉    | 340/576 [04:45<02:13,  1.77it/s]Train:  59%|█████▉    | 341/576 [04:46<02:16,  1.72it/s]Train:  59%|█████▉    | 342/576 [04:46<02:10,  1.80it/s]Train:  60%|█████▉    | 343/576 [04:47<02:23,  1.62it/s]Train:  60%|█████▉    | 344/576 [04:48<02:17,  1.68it/s]Train:  60%|█████▉    | 345/576 [04:48<02:11,  1.76it/s]Train:  60%|██████    | 346/576 [04:49<02:00,  1.90it/s]Train:  60%|██████    | 347/576 [04:49<02:01,  1.89it/s]Train:  60%|██████    | 348/576 [04:50<02:05,  1.82it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.61it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.46it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.53it/s]                                                                                                          {'eval_loss': 0.90752518, 'eval_token_acc': 0.77022272, 'eval_runtime': 1.3401, 'eval_samples_per_second': 373.119, 'eval_steps_per_second': 4.477, 'epoch': 0.6, 'global_step/max_steps': '348/576', 'percentage': '60.42%', 'elapsed_time': '4m 51s', 'remaining_time': '3m 11s'}
+Train:  60%|██████    | 348/576 [04:51<02:05,  1.82it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.53it/s]Train:  60%|██████    | 348/576 [04:51<02:05,  1.82it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.28it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-348
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  61%|██████    | 349/576 [05:09<23:09,  6.12s/it]Train:  61%|██████    | 350/576 [05:09<16:43,  4.44s/it]                                                        {'loss': 0.9384264, 'token_acc': 0.77098171, 'grad_norm': 2.39310098, 'learning_rate': 6.81e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.077673, 'epoch': 0.61, 'global_step/max_steps': '350/576', 'percentage': '60.76%', 'elapsed_time': '5m 9s', 'remaining_time': '3m 20s'}
+Train:  61%|██████    | 350/576 [05:09<16:43,  4.44s/it]Train:  61%|██████    | 350/576 [05:09<16:43,  4.44s/it]Train:  61%|██████    | 351/576 [05:10<12:14,  3.26s/it]Train:  61%|██████    | 352/576 [05:11<09:14,  2.48s/it]Train:  61%|██████▏   | 353/576 [05:11<07:01,  1.89s/it]Train:  61%|██████▏   | 354/576 [05:12<05:28,  1.48s/it]Train:  62%|██████▏   | 355/576 [05:12<04:29,  1.22s/it]Train:  62%|██████▏   | 356/576 [05:13<03:43,  1.01s/it]Train:  62%|██████▏   | 357/576 [05:13<03:15,  1.12it/s]Train:  62%|██████▏   | 358/576 [05:14<02:49,  1.29it/s]Train:  62%|██████▏   | 359/576 [05:14<02:34,  1.40it/s]Train:  62%|██████▎   | 360/576 [05:15<02:26,  1.48it/s]                                                        {'loss': 0.93376245, 'token_acc': 0.75048331, 'grad_norm': 2.09770656, 'learning_rate': 6.29e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.089594, 'epoch': 0.62, 'global_step/max_steps': '360/576', 'percentage': '62.50%', 'elapsed_time': '5m 15s', 'remaining_time': '3m 9s'}
+Train:  62%|██████▎   | 360/576 [05:15<02:26,  1.48it/s]Train:  62%|██████▎   | 360/576 [05:15<02:26,  1.48it/s]Train:  63%|██████▎   | 361/576 [05:16<02:13,  1.61it/s]Train:  63%|██████▎   | 362/576 [05:16<02:11,  1.63it/s]Train:  63%|██████▎   | 363/576 [05:17<02:01,  1.75it/s]Train:  63%|██████▎   | 364/576 [05:17<02:02,  1.73it/s]Train:  63%|██████▎   | 365/576 [05:18<01:58,  1.78it/s]Train:  64%|██████▎   | 366/576 [05:18<01:59,  1.76it/s]Train:  64%|██████▎   | 367/576 [05:19<01:56,  1.79it/s]Train:  64%|██████▍   | 368/576 [05:19<01:51,  1.86it/s]Train:  64%|��█████▍   | 369/576 [05:20<01:50,  1.87it/s]Train:  64%|██████▍   | 370/576 [05:20<01:49,  1.88it/s]                                                        {'loss': 0.99854689, 'token_acc': 0.74321536, 'grad_norm': 2.55897164, 'learning_rate': 5.78e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.102042, 'epoch': 0.64, 'global_step/max_steps': '370/576', 'percentage': '64.24%', 'elapsed_time': '5m 20s', 'remaining_time': '2m 58s'}
+Train:  64%|██████▍   | 370/576 [05:20<01:49,  1.88it/s]Train:  64%|██████▍   | 370/576 [05:20<01:49,  1.88it/s]Train:  64%|██████▍   | 371/576 [05:21<01:46,  1.93it/s]Train:  65%|██████▍   | 372/576 [05:22<01:52,  1.81it/s]Train:  65%|██████▍   | 373/576 [05:22<01:51,  1.82it/s]Train:  65%|██████▍   | 374/576 [05:23<01:50,  1.83it/s]Train:  65%|██████▌   | 375/576 [05:23<01:51,  1.80it/s]Train:  65%|██████▌   | 376/576 [05:24<01:46,  1.89it/s]Train:  65%|██████▌   | 377/576 [05:24<01:48,  1.83it/s]Train:  66%|██████▌   | 378/576 [05:25<01:44,  1.90it/s]Train:  66%|██████▌   | 379/576 [05:25<01:54,  1.72it/s]Train:  66%|██████▌   | 380/576 [05:26<01:57,  1.67it/s]                                                        {'loss': 0.92499561, 'token_acc': 0.77031549, 'grad_norm': 2.33677197, 'learning_rate': 5.29e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.113083, 'epoch': 0.66, 'global_step/max_steps': '380/576', 'percentage': '65.97%', 'elapsed_time': '5m 26s', 'remaining_time': '2m 48s'}
+Train:  66%|██████▌   | 380/576 [05:26<01:57,  1.67it/s]Train:  66%|██████▌   | 380/576 [05:26<01:57,  1.67it/s]Train:  66%|██████▌   | 381/576 [05:27<01:55,  1.69it/s]Train:  66%|██████▋   | 382/576 [05:27<01:58,  1.64it/s]Train:  66%|██████▋   | 383/576 [05:28<01:52,  1.72it/s]Train:  67%|██████▋   | 384/576 [05:28<01:48,  1.77it/s]Train:  67%|██████▋   | 385/576 [05:29<01:45,  1.81it/s]Train:  67%|██████▋   | 386/576 [05:29<01:47,  1.76it/s]Train:  67%|██████▋   | 387/576 [05:30<01:50,  1.72it/s]Train:  67%|██████▋   | 388/576 [05:31<01:45,  1.78it/s]Train:  68%|██████▊   | 389/576 [05:31<01:49,  1.70it/s]Train:  68%|██████▊   | 390/576 [05:32<01:47,  1.72it/s]                                                        {'loss': 0.94684763, 'token_acc': 0.77717532, 'grad_norm': 2.16214776, 'learning_rate': 4.81e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.12348, 'epoch': 0.68, 'global_step/max_steps': '390/576', 'percentage': '67.71%', 'elapsed_time': '5m 32s', 'remaining_time': '2m 38s'}
+Train:  68%|██████▊   | 390/576 [05:32<01:47,  1.72it/s]Train:  68%|██████▊   | 390/576 [05:32<01:47,  1.72it/s]Train:  68%|██████▊   | 391/576 [05:32<01:47,  1.72it/s]Train:  68%|██████▊   | 392/576 [05:33<01:47,  1.71it/s]Train:  68%|██████▊   | 393/576 [05:34<01:43,  1.76it/s]Train:  68%|██████▊   | 394/576 [05:34<01:46,  1.71it/s]Train:  69%|██████▊   | 395/576 [05:35<01:52,  1.61it/s]Train:  69%|██████▉   | 396/576 [05:35<01:46,  1.69it/s]Train:  69%|██████▉   | 397/576 [05:36<01:42,  1.75it/s]Train:  69%|██████▉   | 398/576 [05:37<01:44,  1.71it/s]Train:  69%|██████▉   | 399/576 [05:37<01:42,  1.73it/s]Train:  69%|██████▉   | 400/576 [05:38<01:36,  1.82it/s]                                                        {'loss': 0.95105591, 'token_acc': 0.75876201, 'grad_norm': 2.43672657, 'learning_rate': 4.35e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.133548, 'epoch': 0.69, 'global_step/max_steps': '400/576', 'percentage': '69.44%', 'elapsed_time': '5m 38s', 'remaining_time': '2m 28s'}
+Train:  69%|██████▉   | 400/576 [05:38<01:36,  1.82it/s]Train:  69%|██████▉   | 400/576 [05:38<01:36,  1.82it/s]Train:  70%|██████▉   | 401/576 [05:38<01:35,  1.83it/s]Train:  70%|██████▉   | 402/576 [05:39<01:32,  1.88it/s]Train:  70%|██████▉   | 403/576 [05:39<01:31,  1.88it/s]Train:  70%|███████   | 404/576 [05:40<01:27,  1.97it/s]Train:  70%|███████   | 405/576 [05:40<01:33,  1.82it/s]Train:  70%|███████   | 406/576 [05:41<01:37,  1.74it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.60it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.45it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.79it/s]                                                                                                          {'eval_loss': 0.88947815, 'eval_token_acc': 0.7752165, 'eval_runtime': 1.4326, 'eval_samples_per_second': 349.022, 'eval_steps_per_second': 4.188, 'epoch': 0.7, 'global_step/max_steps': '406/576', 'percentage': '70.49%', 'elapsed_time': '5m 42s', 'remaining_time': '2m 23s'}
+Train:  70%|███████   | 406/576 [05:42<01:37,  1.74it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.79it/s]Train:  70%|███████   | 406/576 [05:42<01:37,  1.74it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.50it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-406
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  71%|███████   | 407/576 [06:00<16:53,  6.00s/it]Train:  71%|███████   | 408/576 [06:00<12:15,  4.38s/it]Train:  71%|███████   | 409/576 [06:01<08:57,  3.22s/it]Train:  71%|███████   | 410/576 [06:01<06:49,  2.47s/it]                                                        {'loss': 0.95857811, 'token_acc': 0.75379025, 'grad_norm': 2.14144373, 'learning_rate': 3.9e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.088528, 'epoch': 0.71, 'global_step/max_steps': '410/576', 'percentage': '71.18%', 'elapsed_time': '6m 1s', 'remaining_time': '2m 26s'}
+Train:  71%|███████   | 410/576 [06:01<06:49,  2.47s/it]Train:  71%|███████   | 410/576 [06:01<06:49,  2.47s/it]Train:  71%|███████▏  | 411/576 [06:02<05:15,  1.91s/it]Train:  72%|███████▏  | 412/576 [06:02<04:04,  1.49s/it]Train:  72%|███████▏  | 413/576 [06:03<03:15,  1.20s/it]Train:  72%|███████▏  | 414/576 [06:04<02:41,  1.00it/s]Train:  72%|███████▏  | 415/576 [06:04<02:20,  1.15it/s]Train:  72%|███████▏  | 416/576 [06:05<02:13,  1.20it/s]Train:  72%|███████▏  | 417/576 [06:05<02:01,  1.31it/s]Train:  73%|███████▎  | 418/576 [06:06<01:48,  1.46it/s]Train:  73%|███████▎  | 419/576 [06:06<01:41,  1.55it/s]Train:  73%|███████▎  | 420/576 [06:07<01:37,  1.59it/s]                                                        {'loss': 0.92633076, 'token_acc': 0.78247734, 'grad_norm': 2.09863329, 'learning_rate': 3.47e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.098353, 'epoch': 0.73, 'global_step/max_steps': '420/576', 'percentage': '72.92%', 'elapsed_time': '6m 7s', 'remaining_time': '2m 16s'}
+Train:  73%|███████▎  | 420/576 [06:07<01:37,  1.59it/s]Train:  73%|███████▎  | 420/576 [06:07<01:37,  1.59it/s]Train:  73%|███████▎  | 421/576 [06:08<01:32,  1.68it/s]Train:  73%|███████▎  | 422/576 [06:08<01:27,  1.77it/s]Train:  73%|███████▎  | 423/576 [06:09<01:27,  1.75it/s]Train:  74%|███████▎  | 424/576 [06:09<01:26,  1.75it/s]Train:  74%|███████▍  | 425/576 [06:10<01:25,  1.76it/s]Train:  74%|███████▍  | 426/576 [06:10<01:27,  1.72it/s]Train:  74%|███████▍  | 427/576 [06:11<01:26,  1.72it/s]Train:  74%|███████▍  | 428/576 [06:12<01:23,  1.78it/s]Train:  74%|███████▍  | 429/576 [06:12<01:21,  1.80it/s]Train:  75%|███████▍  | 430/576 [06:13<01:20,  1.81it/s]                                                        {'loss': 0.96292439, 'token_acc': 0.74, 'grad_norm': 2.48212934, 'learning_rate': 3.07e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.108481, 'epoch': 0.75, 'global_step/max_steps': '430/576', 'percentage': '74.65%', 'elapsed_time': '6m 13s', 'remaining_time': '2m 6s'}
+Train:  75%|███████▍  | 430/576 [06:13<01:20,  1.81it/s]Train:  75%|███████▍  | 430/576 [06:13<01:20,  1.81it/s]Train:  75%|███████▍  | 431/576 [06:13<01:17,  1.87it/s]Train:  75%|███████▌  | 432/576 [06:14<01:16,  1.88it/s]Train:  75%|███████▌  | 433/576 [06:14<01:15,  1.88it/s]Train:  75%|███████▌  | 434/576 [06:15<01:16,  1.87it/s]Train:  76%|███████▌  | 435/576 [06:15<01:18,  1.79it/s]Train:  76%|███████▌  | 436/576 [06:16<01:22,  1.70it/s]Train:  76%|███████▌  | 437/576 [06:16<01:17,  1.79it/s]Train:  76%|███████▌  | 438/576 [06:17<01:18,  1.76it/s]Train:  76%|███████▌  | 439/576 [06:18<01:19,  1.73it/s]Train:  76%|███████▋  | 440/576 [06:18<01:15,  1.81it/s]                                                        {'loss': 0.93844452, 'token_acc': 0.77675474, 'grad_norm': 2.60922456, 'learning_rate': 2.68e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.118263, 'epoch': 0.76, 'global_step/max_steps': '440/576', 'percentage': '76.39%', 'elapsed_time': '6m 18s', 'remaining_time': '1m 57s'}
+Train:  76%|███████▋  | 440/576 [06:18<01:15,  1.81it/s]Train:  76%|███████▋  | 440/576 [06:18<01:15,  1.81it/s]Train:  77%|███████▋  | 441/576 [06:19<01:13,  1.83it/s]Train:  77%|███████▋  | 442/576 [06:19<01:16,  1.75it/s]Train:  77%|███████▋  | 443/576 [06:20<01:17,  1.71it/s]Train:  77%|███████▋  | 444/576 [06:21<01:17,  1.70it/s]Train:  77%|███████▋  | 445/576 [06:21<01:14,  1.75it/s]Train:  77%|███████▋  | 446/576 [06:22<01:18,  1.66it/s]Train:  78%|███████▊  | 447/576 [06:22<01:14,  1.72it/s]Train:  78%|███████▊  | 448/576 [06:23<01:12,  1.76it/s]Train:  78%|███████▊  | 449/576 [06:23<01:12,  1.74it/s]Train:  78%|███████▊  | 450/576 [06:24<01:09,  1.81it/s]                                                        {'loss': 0.96402092, 'token_acc': 0.7731535, 'grad_norm': 2.3300817, 'learning_rate': 2.32e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.127248, 'epoch': 0.78, 'global_step/max_steps': '450/576', 'percentage': '78.12%', 'elapsed_time': '6m 24s', 'remaining_time': '1m 47s'}
+Train:  78%|███████▊  | 450/576 [06:24<01:09,  1.81it/s]Train:  78%|███████▊  | 450/576 [06:24<01:09,  1.81it/s]Train:  78%|███████▊  | 451/576 [06:24<01:08,  1.84it/s]Train:  78%|███████▊  | 452/576 [06:25<01:06,  1.87it/s]Train:  79%|███████▊  | 453/576 [06:26<01:10,  1.75it/s]Train:  79%|███████▉  | 454/576 [06:27<01:22,  1.48it/s]Train:  79%|███████▉  | 455/576 [06:27<01:18,  1.55it/s]Train:  79%|███████▉  | 456/576 [06:28<01:14,  1.61it/s]Train:  79%|███████▉  | 457/576 [06:28<01:10,  1.68it/s]Train:  80%|███████▉  | 458/576 [06:29<01:08,  1.73it/s]Train:  80%|███████▉  | 459/576 [06:29<01:06,  1.77it/s]Train:  80%|███████▉  | 460/576 [06:30<01:06,  1.74it/s]                                                        {'loss': 0.93896999, 'token_acc': 0.77493637, 'grad_norm': 2.40273523, 'learning_rate': 1.98e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.135351, 'epoch': 0.8, 'global_step/max_steps': '460/576', 'percentage': '79.86%', 'elapsed_time': '6m 30s', 'remaining_time': '1m 38s'}
+Train:  80%|███████▉  | 460/576 [06:30<01:06,  1.74it/s]Train:  80%|███████▉  | 460/576 [06:30<01:06,  1.74it/s]Train:  80%|████████  | 461/576 [06:30<01:04,  1.79it/s]Train:  80%|████████  | 462/576 [06:31<01:05,  1.75it/s]Train:  80%|████████  | 463/576 [06:32<01:06,  1.71it/s]Train:  81%|████████  | 464/576 [06:32<01:06,  1.70it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.57it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.40it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.75it/s]                                                                                                          {'eval_loss': 0.87524951, 'eval_token_acc': 0.77747108, 'eval_runtime': 1.3179, 'eval_samples_per_second': 379.4, 'eval_steps_per_second': 4.553, 'epoch': 0.81, 'global_step/max_steps': '464/576', 'percentage': '80.56%', 'elapsed_time': '6m 34s', 'remaining_time': '1m 35s'}
+Train:  81%|████████  | 464/576 [06:34<01:06,  1.70it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.75it/s]Train:  81%|████████  | 464/576 [06:34<01:06,  1.70it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.48it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-464
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  81%|████████  | 465/576 [06:51<11:12,  6.05s/it]Train:  81%|████████  | 466/576 [06:52<08:05,  4.41s/it]Train:  81%|████████  | 467/576 [06:52<05:58,  3.29s/it]Train:  81%|████████▏ | 468/576 [06:53<04:24,  2.45s/it]Train:  81%|████████▏ | 469/576 [06:53<03:21,  1.88s/it]Train:  82%|████████▏ | 470/576 [06:54<02:42,  1.54s/it]                                                        {'loss': 0.93612146, 'token_acc': 0.76815505, 'grad_norm': 2.1542542, 'learning_rate': 1.66e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.094711, 'epoch': 0.82, 'global_step/max_steps': '470/576', 'percentage': '81.60%', 'elapsed_time': '6m 54s', 'remaining_time': '1m 33s'}
+Train:  82%|████████▏ | 470/576 [06:54<02:42,  1.54s/it]Train:  82%|████████▏ | 470/576 [06:54<02:42,  1.54s/it]Train:  82%|████████▏ | 471/576 [06:55<02:10,  1.24s/it]Train:  82%|████████▏ | 472/576 [06:55<01:48,  1.04s/it]Train:  82%|████████▏ | 473/576 [06:56<01:34,  1.09it/s]Train:  82%|████████▏ | 474/576 [06:56<01:25,  1.20it/s]Train:  82%|████████▏ | 475/576 [06:57<01:15,  1.34it/s]Train:  83%|████████▎ | 476/576 [06:58<01:09,  1.44it/s]Train:  83%|████████▎ | 477/576 [06:58<01:04,  1.53it/s]Train:  83%|████████▎ | 478/576 [06:59<00:58,  1.66it/s]Train:  83%|████████▎ | 479/576 [06:59<00:55,  1.74it/s]Train:  83%|████████▎ | 480/576 [07:00<00:55,  1.72it/s]                                                        {'loss': 0.94844856, 'token_acc': 0.77056026, 'grad_norm': 2.18318248, 'learning_rate': 1.37e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.103444, 'epoch': 0.83, 'global_step/max_steps': '480/576', 'percentage': '83.33%', 'elapsed_time': '7m 0s', 'remaining_time': '1m 24s'}
+Train:  83%|████████▎ | 480/576 [07:00<00:55,  1.72it/s]Train:  83%|████████▎ | 480/576 [07:00<00:55,  1.72it/s]Train:  84%|████████▎ | 481/576 [07:00<00:54,  1.73it/s]Train:  84%|████████▎ | 482/576 [07:01<00:55,  1.70it/s]Train:  84%|████████▍ | 483/576 [07:01<00:54,  1.70it/s]Train:  84%|████████▍ | 484/576 [07:02<00:54,  1.70it/s]Train:  84%|████████▍ | 485/576 [07:03<00:51,  1.75it/s]Train:  84%|████████▍ | 486/576 [07:03<00:51,  1.76it/s]Train:  85%|████████▍ | 487/576 [07:04<00:49,  1.80it/s]Train:  85%|████████▍ | 488/576 [07:04<00:47,  1.85it/s]Train:  85%|████████▍ | 489/576 [07:05<00:50,  1.71it/s]Train:  85%|████████▌ | 490/576 [07:05<00:48,  1.76it/s]                                                        {'loss': 0.92182655, 'token_acc': 0.76510849, 'grad_norm': 2.37514949, 'learning_rate': 1.1e-06, 'memory(GiB)': 61.65, 'train_speed(iter/s)': 1.111883, 'epoch': 0.85, 'global_step/max_steps': '490/576', 'percentage': '85.07%', 'elapsed_time': '7m 5s', 'remaining_time': '1m 14s'}
+Train:  85%|████████▌ | 490/576 [07:05<00:48,  1.76it/s]Train:  85%|████████▌ | 490/576 [07:05<00:48,  1.76it/s]Train:  85%|████████▌ | 491/576 [07:06<00:51,  1.64it/s]Train:  85%|████████▌ | 492/576 [07:07<00:47,  1.75it/s]Train:  86%|████████▌ | 493/576 [07:07<00:46,  1.80it/s]Train:  86%|████████▌ | 494/576 [07:08<00:45,  1.79it/s]Train:  86%|████████▌ | 495/576 [07:08<00:46,  1.75it/s]Train:  86%|████████▌ | 496/576 [07:09<00:46,  1.71it/s]Train:  86%|████████▋ | 497/576 [07:09<00:44,  1.77it/s]Train:  86%|████████▋ | 498/576 [07:10<00:43,  1.80it/s]Train:  87%|████████▋ | 499/576 [07:11<00:49,  1.55it/s]Train:  87%|████████▋ | 500/576 [07:11<00:47,  1.60it/s]                                                        {'loss': 0.87140045, 'token_acc': 0.77599027, 'grad_norm': 2.14369798, 'learning_rate': 8.6e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.119386, 'epoch': 0.87, 'global_step/max_steps': '500/576', 'percentage': '86.81%', 'elapsed_time': '7m 11s', 'remaining_time': '1m 5s'}
+Train:  87%|████████▋ | 500/576 [07:11<00:47,  1.60it/s]Train:  87%|████████▋ | 500/576 [07:11<00:47,  1.60it/s]Train:  87%|████████▋ | 501/576 [07:12<00:44,  1.70it/s]Train:  87%|████████▋ | 502/576 [07:12<00:43,  1.70it/s]Train:  87%|████████▋ | 503/576 [07:13<00:41,  1.75it/s]Train:  88%|████████▊ | 504/576 [07:13<00:39,  1.82it/s]Train:  88%|████████▊ | 505/576 [07:14<00:38,  1.86it/s]Train:  88%|████████▊ | 506/576 [07:15<00:37,  1.87it/s]Train:  88%|████████▊ | 507/576 [07:15<00:38,  1.78it/s]Train:  88%|████████▊ | 508/576 [07:16<00:38,  1.76it/s]Train:  88%|████████▊ | 509/576 [07:16<00:37,  1.79it/s]Train:  89%|████████▊ | 510/576 [07:17<00:37,  1.78it/s]                                                        {'loss': 0.93843174, 'token_acc': 0.77577674, 'grad_norm': 2.25089169, 'learning_rate': 6.5e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.127934, 'epoch': 0.89, 'global_step/max_steps': '510/576', 'percentage': '88.54%', 'elapsed_time': '7m 17s', 'remaining_time': '56s'}
+Train:  89%|████████▊ | 510/576 [07:17<00:37,  1.78it/s]Train:  89%|████████▊ | 510/576 [07:17<00:37,  1.78it/s]Train:  89%|████████▊ | 511/576 [07:17<00:35,  1.81it/s]Train:  89%|████████▉ | 512/576 [07:18<00:41,  1.54it/s]Train:  89%|████████▉ | 513/576 [07:19<00:38,  1.63it/s]Train:  89%|████████▉ | 514/576 [07:20<00:41,  1.51it/s]Train:  89%|████████▉ | 515/576 [07:20<00:39,  1.56it/s]Train:  90%|████████▉ | 516/576 [07:21<00:38,  1.57it/s]Train:  90%|████████▉ | 517/576 [07:21<00:35,  1.66it/s]Train:  90%|████████▉ | 518/576 [07:22<00:33,  1.73it/s]Train:  90%|█████████ | 519/576 [07:22<00:32,  1.78it/s]Train:  90%|█████████ | 520/576 [07:23<00:32,  1.72it/s]                                                        {'loss': 0.91953583, 'token_acc': 0.77256221, 'grad_norm': 2.13122702, 'learning_rate': 4.7e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.134679, 'epoch': 0.9, 'global_step/max_steps': '520/576', 'percentage': '90.28%', 'elapsed_time': '7m 23s', 'remaining_time': '47s'}
+Train:  90%|█████████ | 520/576 [07:23<00:32,  1.72it/s]Train:  90%|█████████ | 520/576 [07:23<00:32,  1.72it/s]Train:  90%|█████████ | 521/576 [07:23<00:30,  1.77it/s]Train:  91%|█████████ | 522/576 [07:24<00:33,  1.60it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.57it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.44it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.50it/s]                                                                                                          {'eval_loss': 0.86823428, 'eval_token_acc': 0.77888283, 'eval_runtime': 1.3472, 'eval_samples_per_second': 371.146, 'eval_steps_per_second': 4.454, 'epoch': 0.91, 'global_step/max_steps': '522/576', 'percentage': '90.62%', 'elapsed_time': '7m 26s', 'remaining_time': '46s'}
+Train:  91%|█████████ | 522/576 [07:26<00:33,  1.60it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.50it/s]Train:  91%|█████████ | 522/576 [07:26<00:33,  1.60it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.25it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-522
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  91%|█████████ | 523/576 [07:43<05:24,  6.11s/it]Train:  91%|█████████ | 524/576 [07:44<03:51,  4.44s/it]Train:  91%|█████████ | 525/576 [07:44<02:48,  3.30s/it]Train:  91%|█████████▏| 526/576 [07:45<02:03,  2.47s/it]Train:  91%|█████████▏| 527/576 [07:45<01:31,  1.87s/it]Train:  92%|█████████▏| 528/576 [07:46<01:11,  1.50s/it]Train:  92%|█████████▏| 529/576 [07:47<00:58,  1.25s/it]Train:  92%|█████████▏| 530/576 [07:47<00:46,  1.01s/it]                                                        {'loss': 0.92105646, 'token_acc': 0.75577157, 'grad_norm': 2.31223822, 'learning_rate': 3.2e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.09865, 'epoch': 0.92, 'global_step/max_steps': '530/576', 'percentage': '92.01%', 'elapsed_time': '7m 47s', 'remaining_time': '40s'}
+Train:  92%|█████████▏| 530/576 [07:47<00:46,  1.01s/it]Train:  92%|█████████▏| 530/576 [07:47<00:46,  1.01s/it]Train:  92%|█████████▏| 531/576 [07:48<00:39,  1.15it/s]Train:  92%|█████████▏| 532/576 [07:48<00:35,  1.25it/s]Train:  93%|█████████▎| 533/576 [07:49<00:31,  1.36it/s]Train:  93%|█████████▎| 534/576 [07:49<00:27,  1.52it/s]Train:  93%|█████████▎| 535/576 [07:50<00:25,  1.61it/s]Train:  93%|█████████▎| 536/576 [07:50<00:24,  1.66it/s]Train:  93%|█████████▎| 537/576 [07:51<00:23,  1.67it/s]Train:  93%|█████████▎| 538/576 [07:52<00:22,  1.69it/s]Train:  94%|█████████▎| 539/576 [07:52<00:21,  1.75it/s]Train:  94%|█████████▍| 540/576 [07:53<00:20,  1.79it/s]                                                        {'loss': 0.93533497, 'token_acc': 0.75490352, 'grad_norm': 2.32763433, 'learning_rate': 2e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.106631, 'epoch': 0.94, 'global_step/max_steps': '540/576', 'percentage': '93.75%', 'elapsed_time': '7m 53s', 'remaining_time': '31s'}
+Train:  94%|█████████▍| 540/576 [07:53<00:20,  1.79it/s]Train:  94%|█████████▍| 540/576 [07:53<00:20,  1.79it/s]Train:  94%|█████████▍| 541/576 [07:53<00:19,  1.79it/s]Train:  94%|█████████▍| 542/576 [07:54<00:19,  1.75it/s]Train:  94%|█████████▍| 543/576 [07:54<00:18,  1.79it/s]Train:  94%|█████████▍| 544/576 [07:55<00:19,  1.68it/s]Train:  95%|█████████▍| 545/576 [07:55<00:17,  1.81it/s]Train:  95%|█████████▍| 546/576 [07:56<00:17,  1.71it/s]Train:  95%|█████████▍| 547/576 [07:57<00:17,  1.63it/s]Train:  95%|█████████▌| 548/576 [07:57<00:17,  1.61it/s]Train:  95%|█████████▌| 549/576 [07:58<00:16,  1.61it/s]Train:  95%|█████████▌| 550/576 [07:59<00:16,  1.57it/s]                                                        {'loss': 0.91859741, 'token_acc': 0.76593915, 'grad_norm': 2.30822229, 'learning_rate': 1e-07, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.113207, 'epoch': 0.95, 'global_step/max_steps': '550/576', 'percentage': '95.49%', 'elapsed_time': '7m 59s', 'remaining_time': '22s'}
+Train:  95%|█████████▌| 550/576 [07:59<00:16,  1.57it/s]Train:  95%|█████████▌| 550/576 [07:59<00:16,  1.57it/s]Train:  96%|█████████▌| 551/576 [07:59<00:15,  1.63it/s]Train:  96%|█████████▌| 552/576 [08:00<00:14,  1.65it/s]Train:  96%|█████████▌| 553/576 [08:00<00:13,  1.66it/s]Train:  96%|█████████▌| 554/576 [08:01<00:13,  1.69it/s]Train:  96%|█████████▋| 555/576 [08:02<00:12,  1.70it/s]Train:  97%|█████████▋| 556/576 [08:02<00:11,  1.76it/s]Train:  97%|█████████▋| 557/576 [08:03<00:10,  1.73it/s]Train:  97%|█████████▋| 558/576 [08:03<00:09,  1.85it/s]Train:  97%|█████████▋| 559/576 [08:04<00:09,  1.74it/s]Train:  97%|█████████▋| 560/576 [08:04<00:08,  1.82it/s]                                                        {'loss': 0.93017979, 'token_acc': 0.76844238, 'grad_norm': 2.39454794, 'learning_rate': 4e-08, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.120728, 'epoch': 0.97, 'global_step/max_steps': '560/576', 'percentage': '97.22%', 'elapsed_time': '8m 4s', 'remaining_time': '13s'}
+Train:  97%|█████████▋| 560/576 [08:04<00:08,  1.82it/s]Train:  97%|█████████▋| 560/576 [08:04<00:08,  1.82it/s]Train:  97%|█████████▋| 561/576 [08:05<00:08,  1.73it/s]Train:  98%|█████████▊| 562/576 [08:06<00:08,  1.69it/s]Train:  98%|█████████▊| 563/576 [08:06<00:07,  1.71it/s]Train:  98%|█████████▊| 564/576 [08:07<00:06,  1.73it/s]Train:  98%|█████████▊| 565/576 [08:07<00:06,  1.74it/s]Train:  98%|█████████▊| 566/576 [08:08<00:05,  1.71it/s]Train:  98%|█████████▊| 567/576 [08:09<00:05,  1.71it/s]Train:  99%|█████████▊| 568/576 [08:09<00:04,  1.75it/s]Train:  99%|█████████▉| 569/576 [08:10<00:03,  1.83it/s]Train:  99%|█████████▉| 570/576 [08:10<00:03,  1.86it/s]                                                        {'loss': 0.88816948, 'token_acc': 0.77043975, 'grad_norm': 2.22240019, 'learning_rate': 1e-08, 'memory(GiB)': 64.39, 'train_speed(iter/s)': 1.127843, 'epoch': 0.99, 'global_step/max_steps': '570/576', 'percentage': '98.96%', 'elapsed_time': '8m 10s', 'remaining_time': '5s'}
+Train:  99%|█████████▉| 570/576 [08:10<00:03,  1.86it/s]Train:  99%|█████████▉| 570/576 [08:10<00:03,  1.86it/s]Train:  99%|█████████▉| 571/576 [08:11<00:02,  1.93it/s]Train:  99%|█████████▉| 572/576 [08:11<00:02,  1.85it/s]Train:  99%|█████████▉| 573/576 [08:12<00:01,  1.83it/s]Train: 100%|█████████▉| 574/576 [08:12<00:01,  1.75it/s]Train: 100%|█████████▉| 575/576 [08:13<00:00,  1.67it/s]Train: 100%|██████████| 576/576 [08:13<00:00,  1.89it/s]
+Val:   0%|          | 0/6 [00:00<?, ?it/s]Val:  33%|███▎      | 2/6 [00:00<00:00, 16.55it/s]Val:  67%|██████▋   | 4/6 [00:00<00:00,  9.44it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.68it/s]                                                                                                          {'eval_loss': 0.86763388, 'eval_token_acc': 0.77949388, 'eval_runtime': 1.4447, 'eval_samples_per_second': 346.092, 'eval_steps_per_second': 4.153, 'epoch': 1.0, 'global_step/max_steps': '576/576', 'percentage': '100.00%', 'elapsed_time': '8m 15s', 'remaining_time': '0s'}
+Train: 100%|██████████| 576/576 [08:15<00:00,  1.89it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  6.68it/s]Train: 100%|██████████| 576/576 [08:15<00:00,  1.89it/s]Val: 100%|██████████| 6/6 [00:00<00:00,  7.42it/s]
+[INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/checkpoint-576
+                                                        {'train_runtime': 512.0705, 'train_samples_per_second': 107.911, 'train_steps_per_second': 1.125, 'train_loss': 1.02209977, 'epoch': 1.0, 'global_step/max_steps': '576/576', 'percentage': '100.00%', 'elapsed_time': '8m 32s', 'remaining_time': '0s'}
+Train: 100%|██████████| 576/576 [08:32<00:00,  1.89it/s]Train: 100%|██████████| 576/576 [08:32<00:00,  1.89it/s]Train: 100%|██████████| 576/576 [08:32<00:00,  1.12it/s]
+[INFO:swift] last_model_checkpoint: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/last
+[INFO:swift] best_model_checkpoint: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] images_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/images
+[INFO:swift] End time of running main: 2025-09-15 22:13:41.367231
++ bash inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
++ comet_model=/mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=en
++ lp=zh2en
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ lang_pair_strs=en2zh,zh2en
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' de = zh ']'
++ src_lang=de
++ tgt_lang=zh
++ lp=de2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=de
++ lp=zh2de
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' ru = zh ']'
++ src_lang=ru
++ tgt_lang=zh
++ lp=ru2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=ru
++ lp=zh2ru
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' bn = zh ']'
++ src_lang=bn
++ tgt_lang=zh
++ lp=bn2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=bn
++ lp=zh2bn
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' hi = zh ']'
++ src_lang=hi
++ tgt_lang=zh
++ lp=hi2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=hi
++ lp=zh2hi
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' th = zh ']'
++ src_lang=th
++ tgt_lang=zh
++ lp=th2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=th
++ lp=zh2th
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' jv = zh ']'
++ src_lang=jv
++ tgt_lang=zh
++ lp=jv2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=jv
++ lp=zh2jv
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' sw = zh ']'
++ src_lang=sw
++ tgt_lang=zh
++ lp=sw2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=sw
++ lp=zh2sw
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' si = zh ']'
++ src_lang=si
++ tgt_lang=zh
++ lp=si2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=si
++ lp=zh2si
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' km = zh ']'
++ src_lang=km
++ tgt_lang=zh
++ lp=km2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=km
++ lp=zh2km
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ metric=bleu,comet_22
++ python /mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py --metric bleu,comet_22 --comet_22_path /mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt --xcomet_xxl_path /mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt --lang_pair en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km --src_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh --ref_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km --hypo_file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt --record_file result_mt.xlsx
+[2025-09-15 22:13:58,155] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt`
+Encoder model frozen.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+evaluate zh2en
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt -l zh-en
+
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 171, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 153, in main
+    score = bleu_scoring(ref_file, hypo_file, lp)
+  File "/mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py", line 25, in bleu_scoring
+    return float(score.stdout.strip()) 
+ValueError: could not convert string to float: ''
+++++ readlink -f inference.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/inference.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/accelerate_config.yaml
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ predict_model_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
++ comet_model=/mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt
++ xcome_model=/mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt
++ lang_pair_strs=
++ src_file_strs=
++ ref_file_strs=
++ hypo_file_strs=
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' en = zh ']'
++ src_lang=en
++ tgt_lang=zh
++ lp=en2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl
+[2025-09-15 22:18:36,986] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:18:43,766] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:18:43,958] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:18:44,050] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:18:44,089] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:18:44,363] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 22:18:44,451] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:18:44,462] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:18:44,465] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.en2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.54s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.58s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.30s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:18:48.737545
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:16,  6.61it/s][A
+ 13%|█▎        | 16/125 [00:02<00:16,  6.56it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.16it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.34it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.03it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.01it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.80it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.28it/s][A
+ 26%|██▌       | 32/125 [00:04<00:12,  7.16it/s][A
+ 26%|██▌       | 32/125 [00:04<00:14,  6.56it/s][A
+ 26%|██▌       | 32/125 [00:04<00:14,  6.39it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.09it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.23it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.12it/s][A
+ 38%|███▊      | 48/125 [00:06<00:10,  7.31it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.89it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.60it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.19it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.64it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  7.15it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.65it/s][A
+ 26%|██▌       | 32/125 [00:09<00:30,  3.09it/s][A
+ 38%|███▊      | 48/125 [00:09<00:17,  4.49it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.71it/s][A
+ 38%|███▊      | 48/125 [00:09<00:16,  4.79it/s][A
+ 38%|███▊      | 48/125 [00:11<00:16,  4.57it/s][A
+ 51%|█████     | 64/125 [00:11<00:11,  5.36it/s][A
+ 64%|██████▍   | 80/125 [00:11<00:06,  6.62it/s][A
+ 64%|██████▍   | 80/125 [00:11<00:06,  6.98it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.34it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  4.81it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.79it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.59it/s][A
+ 77%|███████▋  | 96/125 [00:14<00:04,  6.56it/s][A
+ 77%|███████▋  | 96/125 [00:14<00:04,  6.32it/s][A
+ 77%|███████▋  | 96/125 [00:14<00:04,  6.05it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:08,  5.01it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.32it/s][A
+ 51%|█████     | 64/125 [00:15<00:13,  4.50it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.24it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.64it/s][A
+ 90%|████████▉ | 112/125 [00:16<00:01,  6.73it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:01,  6.50it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.51it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:02,  6.06it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.60it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:09,  4.86it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.38it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.25it/s][A
+100%|██████████| 125/125 [00:19<00:00,  6.49it/s][A100%|██████████| 125/125 [00:19<00:00,  6.55it/s]
+
+ 90%|████████▉ | 112/125 [00:19<00:02,  6.24it/s][A
+100%|██████████| 125/125 [00:19<00:00,  6.30it/s][A100%|██████████| 125/125 [00:19<00:00,  6.45it/s]
+
+100%|██████████| 125/125 [00:20<00:00,  5.80it/s][A100%|██████████| 125/125 [00:20<00:00,  6.19it/s]
+
+100%|██████████| 125/125 [00:20<00:00,  6.63it/s][A100%|██████████| 125/125 [00:20<00:00,  6.00it/s]
+
+ 77%|███████▋  | 96/125 [00:21<00:05,  4.88it/s][A
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.15it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.56it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:06,  4.22it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.55it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.96it/s][A
+100%|██████████| 125/125 [00:25<00:00,  4.69it/s][A100%|██████████| 125/125 [00:25<00:00,  4.88it/s]
+
+100%|██████████| 125/125 [00:25<00:00,  4.41it/s][A100%|██████████| 125/125 [00:25<00:00,  4.81it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.79it/s][A100%|██████████| 125/125 [00:28<00:00,  4.44it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.26it/s][A100%|██████████| 125/125 [00:28<00:00,  4.35it/s]
+ 99%|█████████▉| 1000/1012 [00:28<00:00, 34.68it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.68it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.71it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.70it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.68it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.68it/s] 99%|█████████▉| 1000/1012 [00:28<00:00, 34.68it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+
+  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:29<00:00, 34.34it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.62it/s][A100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.20it/s][A100%|██████████| 1/1 [00:00<00:00,  2.20it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.81it/s][A100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.64it/s][A100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A100%|██████████| 2/2 [00:00<00:00,  2.10it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.44it/s][A100%|██████████| 2/2 [00:01<00:00,  1.44it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.24it/s][A100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.13it/s][A100%|██████████| 2/2 [00:01<00:00,  1.13it/s]
+100%|██████████| 1012/1012 [00:30<00:00, 32.03it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.02it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.02it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.04it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.02it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.02it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.77it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.77it/s]
+
+100%|██████████| 1012/1012 [00:30<00:00, 32.77it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.79it/s]
+
+100%|██████████| 1012/1012 [00:30<00:00, 32.77it/s]
+[rank1] {'num_prompt_tokens': 8644, 'num_generated_tokens': 3705, 'num_samples': 127, 'runtime': 30.882528150454164, 'samples/s': 4.112357621153251, 'tokens/s': 119.97074792419524}
+[rank6] {'num_prompt_tokens': 7602, 'num_generated_tokens': 3227, 'num_samples': 126, 'runtime': 30.883868861943483, 'samples/s': 4.079799735041064, 'tokens/s': 104.48820432521838}
+100%|██████████| 1012/1012 [00:30<00:00, 32.77it/s]
+[rank7] {'num_prompt_tokens': 7844, 'num_generated_tokens': 3080, 'num_samples': 126, 'runtime': 30.883895145729184, 'samples/s': 4.079796262921326, 'tokens/s': 99.7283530936324}
+[rank4] {'num_prompt_tokens': 8327, 'num_generated_tokens': 3220, 'num_samples': 126, 'runtime': 30.86583779938519, 'samples/s': 4.082183053605944, 'tokens/s': 104.32245581437412}
+[rank5] {'num_prompt_tokens': 8274, 'num_generated_tokens': 3210, 'num_samples': 126, 'runtime': 30.883946370333433, 'samples/s': 4.079789496106409, 'tokens/s': 103.93749430556804}
+100%|██████████| 1012/1012 [00:30<00:00, 32.04it/s][rank2] {'num_prompt_tokens': 9022, 'num_generated_tokens': 3557, 'num_samples': 127, 'runtime': 30.883565647527575, 'samples/s': 4.112219471334495, 'tokens/s': 115.1745248782425}
+100%|██████████| 1012/1012 [00:30<00:00, 32.79it/s]
+[rank3] {'num_prompt_tokens': 8911, 'num_generated_tokens': 3496, 'num_samples': 127, 'runtime': 30.864105751737952, 'samples/s': 4.114812235985442, 'tokens/s': 113.27073682681187}
+100%|██████████| 1012/1012 [00:30<00:00, 32.13it/s]100%|██████████| 1012/1012 [00:30<00:00, 32.75it/s]
+[rank0] {'num_prompt_tokens': 9270, 'num_generated_tokens': 3795, 'num_samples': 127, 'runtime': 30.89907126687467, 'samples/s': 4.110155897667716, 'tokens/s': 122.81922544605499}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:19:22.945093
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ lang_pair_strs=en2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=en
++ lp=zh2en
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/train.log
+[2025-09-15 22:19:55,733] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:20:02,105] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:20:02,715] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-15 22:20:03,162] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:20:03,201] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:20:03,206] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:20:03,245] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:20:03,247] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:20:03,248] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2en.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.57s/it]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.29s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:20:07.399046
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:16,  6.50it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.36it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.11it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.16it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.09it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  4.99it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.62it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.63it/s][A
+ 26%|██▌       | 32/125 [00:04<00:14,  6.54it/s][A
+ 26%|██▌       | 32/125 [00:05<00:13,  6.70it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.08it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.06it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.77it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.05it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.85it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.48it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.44it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.68it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.61it/s][A
+ 38%|███▊      | 48/125 [00:08<00:14,  5.19it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.35it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.75it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.47it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.58it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  6.09it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.37it/s][A
+ 51%|█████     | 64/125 [00:11<00:10,  5.62it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.27it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:06,  6.66it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  4.91it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.96it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.69it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:08,  5.59it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.97it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.66it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.00it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  5.91it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:09,  4.96it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.74it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.03it/s][A
+ 77%|███████▋  | 96/125 [00:16<00:05,  5.45it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.13it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.43it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:08,  5.03it/s][A
+ 90%|████████▉ | 112/125 [00:17<00:02,  6.19it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.57it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  6.13it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.18it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.61it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.94it/s][A100%|██████████| 125/125 [00:20<00:00,  6.16it/s]
+
+ 77%|███████▋  | 96/125 [00:20<00:05,  5.29it/s][A
+ 90%|███��████▉ | 112/125 [00:20<00:02,  5.87it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.68it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.84it/s][A100%|██████████| 125/125 [00:20<00:00,  5.98it/s]
+
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.14it/s][A
+100%|██████████| 125/125 [00:22<00:00,  5.18it/s][A100%|██████████| 125/125 [00:22<00:00,  5.61it/s]
+
+100%|██████████| 125/125 [00:22<00:00,  5.94it/s][A100%|██████████| 125/125 [00:22<00:00,  5.56it/s]
+
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.45it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.63it/s][A
+100%|██████████| 125/125 [00:24<00:00,  4.80it/s][A100%|██████████| 125/125 [00:24<00:00,  5.18it/s]
+
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.79it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.72it/s][A100%|██████████| 125/125 [00:26<00:00,  4.81it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.86it/s][A100%|██████████| 125/125 [00:26<00:00,  4.80it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.68it/s][A100%|██████████| 125/125 [00:27<00:00,  4.58it/s]
+ 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.53it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s] 99%|█████████▉| 1000/1012 [00:27<00:00, 36.50it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:27<00:00, 36.11it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.00it/s][A100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.93it/s][A100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.59it/s][A100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.53it/s][A100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.40it/s][A100%|██████████| 2/2 [00:01<00:00,  1.39it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.30it/s][A100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.25it/s][A100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.83it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 33.80it/s]100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 34.59it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s]
+100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s][rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 3550, 'num_samples': 126, 'runtime': 29.284329127520323, 'samples/s': 4.302642531140995, 'tokens/s': 121.22524591706771}
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 3538, 'num_samples': 127, 'runtime': 29.283806785941124, 'samples/s': 4.336867844004882, 'tokens/s': 120.81762544952181}
+100%|██████████| 1012/1012 [00:29<00:00, 34.56it/s]
+
+[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 3496, 'num_samples': 127, 'runtime': 29.257180102169514, 'samples/s': 4.340814786541323, 'tokens/s': 119.49203538384619}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 3427, 'num_samples': 126, 'runtime': 29.283980736508965, 'samples/s': 4.302693719604627, 'tokens/s': 117.02643950067505}[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 3751, 'num_samples': 127, 'runtime': 29.284253353253007, 'samples/s': 4.336801709369597, 'tokens/s': 128.0893166287036}
+
+[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 3360, 'num_samples': 126, 'runtime': 29.28299998305738, 'samples/s': 4.302837826482988, 'tokens/s': 114.74234203954636}
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 3471, 'num_samples': 126, 'runtime': 29.283736869692802, 'samples/s': 4.302729551241245, 'tokens/s': 118.52995454252667}
+100%|██████████| 1012/1012 [00:29<00:00, 33.92it/s]100%|██████████| 1012/1012 [00:29<00:00, 34.54it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 3813, 'num_samples': 127, 'runtime': 29.298636024817824, 'samples/s': 4.334672777682307, 'tokens/s': 130.14257717561134}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:20:40.354776
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ lang_pair_strs=en2zh,zh2en
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' de = zh ']'
++ src_lang=de
++ tgt_lang=zh
++ lp=de2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/train.log
+[2025-09-15 22:21:13,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:21:19,961] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,230] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,334] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 22:21:20,410] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,470] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,538] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,605] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:21:20,648] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.de2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.56s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.13s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:21:24.668496
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.27it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.77it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.30it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.18it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.79it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.61it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.26it/s][A
+ 13%|█▎        | 16/125 [00:04<00:33,  3.27it/s][A
+ 26%|██▌       | 32/125 [00:05<00:13,  6.68it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.25it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.91it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.22it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.13it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.27it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.85it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.36it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.72it/s][A
+ 26%|██▌       | 32/125 [00:09<00:26,  3.52it/s][A
+ 26%|██▌       | 32/125 [00:09<00:28,  3.21it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.18it/s][A
+ 38%|███▊      | 48/125 [00:09<00:16,  4.75it/s][A
+ 51%|█████     | 64/125 [00:09<00:09,  6.51it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.37it/s][A
+ 38%|███▊      | 48/125 [00:10<00:18,  4.17it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.47it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.50it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.99it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.76it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  6.02it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.84it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.67it/s][A
+ 38%|███▊      | 48/125 [00:14<00:23,  3.24it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.31it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:09,  4.99it/s][A
+ 64%|██████▍   | 80/125 [00:15<00:08,  5.12it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  5.83it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:05,  5.76it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.92it/s][A
+ 51%|█████     | 64/125 [00:16<00:14,  4.08it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.61it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:11,  4.07it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  6.06it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.20it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.31it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.58it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.20it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.49it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  6.08it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.79it/s][A
+100%|██████████| 125/125 [00:20<00:00,  5.97it/s][A100%|██████████| 125/125 [00:20<00:00,  6.07it/s]
+
+ 90%|████████▉ | 112/125 [00:21<00:02,  5.23it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.22it/s][A100%|██████████| 125/125 [00:21<00:00,  5.69it/s]
+
+100%|██████████| 125/125 [00:22<00:00,  6.37it/s][A100%|██████████| 125/125 [00:22<00:00,  5.65it/s]
+
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.58it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.92it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.36it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.12it/s][A
+100%|██████████| 125/125 [00:24<00:00,  5.22it/s][A100%|██████████| 125/125 [00:24<00:00,  5.19it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.61it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:02,  4.90it/s][A
+100%|██████████| 125/125 [00:26<00:00,  4.41it/s][A100%|██████████| 125/125 [00:26<00:00,  4.70it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.42it/s][A100%|██████████| 125/125 [00:27<00:00,  4.59it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.90it/s][A100%|██████████| 125/125 [00:28<00:00,  4.36it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.10it/s][A100%|██████████| 125/125 [00:29<00:00,  4.18it/s]
+ 99%|█████████▉| 1000/1012 [00:30<00:00, 33.33it/s] 99%|█████████▉| 1000/1012 [00:29<00:00, 33.35it/s] 99%|█████████▉| 1000/1012 [00:30<00:00, 33.33it/s] 99%|█████████▉| 1000/1012 [00:30<00:00, 33.33it/s] 99%|█████████▉| 1000/1012 [00:30<00:00, 33.33it/s]
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:30<00:00, 33.09it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:30<00:00, 33.08it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:30<00:00, 33.05it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.07it/s][A100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.02it/s][A100%|██████████| 1/1 [00:00<00:00,  2.02it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.77it/s][A100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.43it/s][A100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.27it/s][A100%|██████████| 2/2 [00:00<00:00,  2.27it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.52it/s][A100%|██████████| 2/2 [00:01<00:00,  1.52it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.46it/s][A100%|██████████| 2/2 [00:01<00:00,  1.46it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.43it/s][A100%|██████████| 2/2 [00:01<00:00,  1.43it/s]
+100%|██████████| 1012/1012 [00:31<00:00, 31.47it/s]100%|██████████| 1012/1012 [00:31<00:00, 31.56it/s]100%|██████████| 1012/1012 [00:31<00:00, 31.47it/s]100%|██████████| 1012/1012 [00:31<00:00, 31.56it/s]100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]100%|██████████| 1012/1012 [00:31<00:00, 31.47it/s]
+100%|██████████| 1012/1012 [00:31<00:00, 31.49it/s]100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]100%|██████████| 1012/1012 [00:31<00:00, 31.47it/s]
+100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]
+100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]
+[rank7] {'num_prompt_tokens': 10813, 'num_generated_tokens': 3062, 'num_samples': 126, 'runtime': 31.627324691042304, 'samples/s': 3.983896874960358, 'tokens/s': 96.81501770736996}
+[rank1] {'num_prompt_tokens': 11846, 'num_generated_tokens': 3757, 'num_samples': 127, 'runtime': 31.62556374631822, 'samples/s': 4.015738692240231, 'tokens/s': 118.79630131296494}
+100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]100%|██████████| 1012/1012 [00:31<00:00, 32.02it/s]
+
+[rank2] {'num_prompt_tokens': 12013, 'num_generated_tokens': 3613, 'num_samples': 127, 'runtime': 31.627733524888754, 'samples/s': 4.01546319783111, 'tokens/s': 114.23518530522678}
+[rank6] {'num_prompt_tokens': 10584, 'num_generated_tokens': 3292, 'num_samples': 126, 'runtime': 31.626764193177223, 'samples/s': 3.9839674786325983, 'tokens/s': 104.08905507665487}
+100%|██████████| 1012/1012 [00:31<00:00, 32.00it/s]
+[rank3] {'num_prompt_tokens': 12099, 'num_generated_tokens': 3527, 'num_samples': 127, 'runtime': 31.605437455698848, 'samples/s': 4.018295908038455, 'tokens/s': 111.59472179253252}
+[rank4] {'num_prompt_tokens': 11843, 'num_generated_tokens': 3159, 'num_samples': 126, 'runtime': 31.62796825170517, 'samples/s': 3.9838158112861684, 'tokens/s': 99.87995355438893}100%|██████████| 1012/1012 [00:31<00:00, 31.55it/s]
+[rank5] {'num_prompt_tokens': 11084, 'num_generated_tokens': 3288, 'num_samples': 126, 'runtime': 31.627619760110974, 'samples/s': 3.9838597072964776, 'tokens/s': 103.95976759992713}
+100%|██████████| 1012/1012 [00:31<00:00, 31.98it/s]
+[rank0] {'num_prompt_tokens': 12235, 'num_generated_tokens': 3879, 'num_samples': 127, 'runtime': 31.642580058425665, 'samples/s': 4.013579163440654, 'tokens/s': 122.58798090540391}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:22:00.044119
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=de
++ lp=zh2de
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/train.log
+[2025-09-15 22:22:32,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:22:39,739] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:22:39,805] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:22:39,866] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:22:39,915] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:22:40,066] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:22:40,167] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:22:40,211] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:22:40,214] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2de.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.26s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.29s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:22:44.460785
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.05it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.95it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.40it/s][A
+ 13%|█▎        | 16/125 [00:04<00:33,  3.29it/s][A
+ 13%|█▎        | 16/125 [00:05<00:34,  3.19it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.08it/s][A
+ 13%|█▎        | 16/125 [00:05<00:37,  2.93it/s][A
+ 13%|█▎        | 16/125 [00:06<00:42,  2.57it/s][A
+ 26%|██▌       | 32/125 [00:07<00:19,  4.74it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.38it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.13it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.81it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.74it/s][A
+ 26%|██▌       | 32/125 [00:09<00:26,  3.53it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.52it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.36it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.22it/s][A
+ 26%|██▌       | 32/125 [00:11<00:33,  2.74it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  4.00it/s][A
+ 26%|██▌       | 32/125 [00:13<00:39,  2.38it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.55it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.66it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.53it/s][A
+ 38%|███▊      | 48/125 [00:14<00:24,  3.17it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.16it/s][A
+ 38%|███▊      | 48/125 [00:16<00:24,  3.14it/s][A
+ 38%|███▊      | 48/125 [00:16<00:26,  2.88it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  3.95it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.22it/s][A
+ 51%|█████     | 64/125 [00:18<00:18,  3.28it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.38it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  4.08it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:11,  3.87it/s][A
+ 51%|█████     | 64/125 [00:20<00:17,  3.53it/s][A
+ 51%|█████     | 64/125 [00:20<00:18,  3.34it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:13,  3.45it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:12,  3.48it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  4.05it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:13,  3.42it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.71it/s][A
+ 64%|██████▍   | 80/125 [00:24<00:12,  3.61it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.84it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:13,  3.28it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.87it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.69it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:03,  4.30it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  4.20it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  3.90it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:08,  3.56it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:07,  3.68it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  4.11it/s][A
+100%|██████████| 125/125 [00:30<00:00,  4.01it/s][A100%|██████████| 125/125 [00:30<00:00,  4.14it/s]
+
+ 77%|███████▋  | 96/125 [00:30<00:09,  3.22it/s][A
+ 90%|████████▉ | 112/125 [00:30<00:03,  3.73it/s][A
+100%|██████████| 125/125 [00:30<00:00,  4.10it/s][A100%|██████████| 125/125 [00:30<00:00,  4.07it/s]
+
+100%|██████████| 125/125 [00:31<00:00,  3.74it/s][A100%|██████████| 125/125 [00:31<00:00,  3.98it/s]
+
+100%|██████████| 125/125 [00:32<00:00,  4.06it/s][A100%|██████████| 125/125 [00:32<00:00,  3.82it/s]
+
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.27it/s][A
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.58it/s][A
+ 90%|████████▉ | 112/125 [00:33<00:03,  3.55it/s][A
+100%|██████████| 125/125 [00:34<00:00,  3.52it/s][A100%|██████████| 125/125 [00:34<00:00,  3.60it/s]
+
+100%|██████████| 125/125 [00:37<00:00,  3.55it/s][A100%|██████████| 125/125 [00:37<00:00,  3.34it/s]
+
+100%|██████████| 125/125 [00:37<00:00,  3.19it/s][A100%|██████████| 125/125 [00:37<00:00,  3.30it/s]
+
+100%|██████████| 125/125 [00:38<00:00,  3.32it/s][A100%|██████████| 125/125 [00:38<00:00,  3.26it/s]
+ 99%|█████████▉| 1000/1012 [00:38<00:00, 25.99it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 25.99it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.00it/s]
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.84it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.82it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.78it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.75it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.73it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.08s/it][A100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.09s/it][A100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
+
+100%|██████████| 1/1 [00:00<00:00,  1.09it/s][A100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.17s/it][A100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
+
+100%|██████████| 2/2 [00:01<00:00,  1.03it/s][A100%|██████████| 2/2 [00:01<00:00,  1.03it/s]
+
+100%|██████████| 2/2 [00:02<00:00,  1.11s/it][A100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.09s/it][A100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.33s/it][A100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
+100%|██████████| 1012/1012 [00:41<00:00, 23.88it/s]100%|██████████| 1012/1012 [00:41<00:00, 23.86it/s]100%|██████████| 1012/1012 [00:41<00:00, 23.95it/s]100%|██████████| 1012/1012 [00:41<00:00, 23.95it/s]100%|██████████| 1012/1012 [00:41<00:00, 24.47it/s]100%|██████████| 1012/1012 [00:41<00:00, 23.94it/s]
+100%|██████████| 1012/1012 [00:41<00:00, 23.92it/s]100%|██████████| 1012/1012 [00:41<00:00, 23.86it/s]100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s][rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 5132, 'num_samples': 126, 'runtime': 41.359959261491895, 'samples/s': 3.046424664090809, 'tokens/s': 124.08136012788916}
+
+100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s]100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s]
+
+[rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 5352, 'num_samples': 126, 'runtime': 41.381526770070195, 'samples/s': 3.04483690754328, 'tokens/s': 129.33307245374314}
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 5592, 'num_samples': 127, 'runtime': 41.38071019575, 'samples/s': 3.069062841097481, 'tokens/s': 135.13542840485917}[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 5188, 'num_samples': 127, 'runtime': 41.38092772103846, 'samples/s': 3.0690467080908865, 'tokens/s': 125.37176631161826}
+
+100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s]100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s]
+
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 5192, 'num_samples': 126, 'runtime': 41.38191152922809, 'samples/s': 3.044808597374871, 'tokens/s': 125.46544632992327}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 5402, 'num_samples': 126, 'runtime': 41.381514405831695, 'samples/s': 3.0448378172994905, 'tokens/s': 130.54138007184005}
+100%|██████████| 1012/1012 [00:41<00:00, 24.46it/s]
+100%|██████████| 1012/1012 [00:41<00:00, 23.91it/s][rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 5264, 'num_samples': 127, 'runtime': 41.38031133078039, 'samples/s': 3.069092423805235, 'tokens/s': 127.2102560544154}
+100%|██████████| 1012/1012 [00:41<00:00, 24.45it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 5814, 'num_samples': 127, 'runtime': 41.39644696936011, 'samples/s': 3.067896143212484, 'tokens/s': 140.4468360365148}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:23:29.162249
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' ru = zh ']'
++ src_lang=ru
++ tgt_lang=zh
++ lp=ru2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/train.log
+[2025-09-15 22:24:01,823] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:24:08,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:24:08,889] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:24:08,982] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:24:09,138] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:24:09,214] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:24:09,280] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:24:09,281] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:24:09,287] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.ru2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.86s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:24:13.424070
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.99it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.81it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.27it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.93it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.70it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.54it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.51it/s][A
+ 26%|██▌       | 32/125 [00:04<00:13,  6.77it/s][A
+ 13%|█▎        | 16/125 [00:05<00:34,  3.15it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.32it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.29it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.08it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  4.95it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.75it/s][A
+ 38%|███▊      | 48/125 [00:07<00:10,  7.03it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.42it/s][A
+ 26%|██▌       | 32/125 [00:08<00:26,  3.57it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.23it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  4.83it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.16it/s][A
+ 51%|█████     | 64/125 [00:09<00:08,  6.87it/s][A
+ 26%|██▌       | 32/125 [00:09<00:28,  3.24it/s][A
+ 51%|█████     | 64/125 [00:10<00:09,  6.32it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.53it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.27it/s][A
+ 51%|█████     | 64/125 [00:11<00:10,  5.81it/s][A
+ 38%|███▊      | 48/125 [00:12<00:18,  4.18it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.06it/s][A
+ 64%|██████▍   | 80/125 [00:12<00:07,  6.05it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.58it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  5.06it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.33it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:07,  5.87it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.13it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  6.13it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.96it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  3.95it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.06it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.68it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.35it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.93it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.81it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  5.15it/s][A
+ 90%|████████▉ | 112/125 [00:19<00:02,  5.97it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:10,  4.20it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.91it/s][A
+100%|██████████| 125/125 [00:20<00:00,  6.06it/s][A100%|██████████| 125/125 [00:20<00:00,  6.17it/s]
+
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.42it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.28it/s][A100%|██████████| 125/125 [00:21<00:00,  5.81it/s]
+
+100%|██████████| 125/125 [00:22<00:00,  5.69it/s][A100%|██████████| 125/125 [00:22<00:00,  5.62it/s]
+
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.42it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.65it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.10it/s][A
+100%|██████████| 125/125 [00:24<00:00,  5.74it/s][A100%|██████████| 125/125 [00:24<00:00,  5.10it/s]
+
+ 90%|████████▉ | 112/125 [00:24<00:03,  4.20it/s][A
+100%|██████████| 125/125 [00:26<00:00,  5.03it/s][A100%|██████████| 125/125 [00:26<00:00,  4.75it/s]
+
+ 90%|████████▉ | 112/125 [00:26<00:02,  4.43it/s][A
+100%|██████████| 125/125 [00:27<00:00,  4.22it/s][A100%|██████████| 125/125 [00:27<00:00,  4.51it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.51it/s][A100%|██████████| 125/125 [00:29<00:00,  4.22it/s]
+
+ 51%|█████     | 64/125 [01:42<02:27,  2.42s/it][A
+ 64%|██████▍   | 80/125 [01:46<01:13,  1.64s/it][A
+ 77%|███████▋  | 96/125 [01:48<00:32,  1.13s/it][A
+ 90%|████████▉ | 112/125 [01:51<00:10,  1.21it/s][A
+100%|██████████| 125/125 [01:54<00:00,  1.51it/s][A100%|██████████| 125/125 [01:54<00:00,  1.09it/s]
+ 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s] 99%|█████████▉| 1000/1012 [01:55<00:01,  8.69it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [01:55<00:01,  8.67it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.98it/s][A100%|██████████| 1/1 [00:00<00:00,  1.98it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.88it/s][A100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.79it/s][A100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.14it/s][A100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A100%|██████████| 2/2 [00:00<00:00,  2.09it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.61it/s][A100%|██████████| 2/2 [00:01<00:00,  1.61it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.63it/s][A100%|██████████| 2/2 [00:01<00:00,  1.63it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.24it/s][A100%|██████████| 2/2 [00:01<00:00,  1.23it/s]
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+[rank1] {'num_prompt_tokens': 13219, 'num_generated_tokens': 4650, 'num_samples': 127, 'runtime': 116.66818867437541, 'samples/s': 1.0885572274929287, 'tokens/s': 39.85662289639463}[rank2] {'num_prompt_tokens': 13889, 'num_generated_tokens': 3504, 'num_samples': 127, 'runtime': 116.67059635929763, 'samples/s': 1.0885347633682443, 'tokens/s': 30.033274101120696}
+
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+[rank6] {'num_prompt_tokens': 11855, 'num_generated_tokens': 3295, 'num_samples': 126, 'runtime': 116.66998030617833, 'samples/s': 1.0799693260368846, 'tokens/s': 28.242054994377256}
+[rank3] {'num_prompt_tokens': 13378, 'num_generated_tokens': 3501, 'num_samples': 127, 'runtime': 116.67035359144211, 'samples/s': 1.0885370283930946, 'tokens/s': 30.007623121293104}
+100%|██████████| 1012/1012 [01:56<00:00,  8.68it/s]
+[rank5] {'num_prompt_tokens': 11659, 'num_generated_tokens': 3241, 'num_samples': 126, 'runtime': 116.67080725356936, 'samples/s': 1.0799616713558415, 'tokens/s': 27.779014102097477}
+[rank7] {'num_prompt_tokens': 11527, 'num_generated_tokens': 3099, 'num_samples': 126, 'runtime': 116.64696561172605, 'samples/s': 1.0801824062822747, 'tokens/s': 26.56734346879976}
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+100%|██████████| 1012/1012 [01:56<00:00,  8.67it/s]
+[rank4] {'num_prompt_tokens': 12648, 'num_generated_tokens': 3223, 'num_samples': 126, 'runtime': 116.66730915568769, 'samples/s': 1.079994052420102, 'tokens/s': 27.625562150396732}
+[rank0] {'num_prompt_tokens': 15095, 'num_generated_tokens': 3783, 'num_samples': 127, 'runtime': 116.6857275404036, 'samples/s': 1.0883936080016725, 'tokens/s': 32.4204174729947}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:26:13.861596
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=ru
++ lp=zh2ru
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/train.log
+[2025-09-15 22:26:46,547] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:26:53,286] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:26:53,476] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:26:53,632] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:26:53,941] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:26:54,024] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:26:54,037] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:26:54,038] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:26:54,042] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2ru.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.45s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.38s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.40s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:26:57.903215
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.72it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.62it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.58it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.37it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.09it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.06it/s][A
+ 13%|█▎        | 16/125 [00:06<00:44,  2.42it/s][A
+ 13%|█▎        | 16/125 [00:07<00:50,  2.15it/s][A
+ 26%|██▌       | 32/125 [00:08<00:22,  4.17it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.78it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.66it/s][A
+ 26%|██▌       | 32/125 [00:08<00:26,  3.57it/s][A
+ 26%|██▌       | 32/125 [00:09<00:28,  3.27it/s][A
+ 26%|██▌       | 32/125 [00:10<00:30,  3.05it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.14it/s][A
+ 26%|██▌       | 32/125 [00:12<00:36,  2.55it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.95it/s][A
+ 38%|███▊      | 48/125 [00:12<00:19,  3.86it/s][A
+ 38%|███▊      | 48/125 [00:15<00:25,  3.07it/s][A
+ 38%|███▊      | 48/125 [00:15<00:23,  3.24it/s][A
+ 38%|███▊      | 48/125 [00:15<00:26,  2.93it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.24it/s][A
+ 26%|██▌       | 32/125 [00:15<00:46,  2.01it/s][A
+ 51%|█████     | 64/125 [00:16<00:14,  4.07it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.63it/s][A
+ 38%|███▊      | 48/125 [00:17<00:28,  2.73it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.26it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.22it/s][A
+ 38%|███▊      | 48/125 [00:19<00:29,  2.59it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:12,  3.72it/s][A
+ 51%|█████     | 64/125 [00:20<00:20,  2.99it/s][A
+ 51%|█████     | 64/125 [00:21<00:19,  3.08it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:13,  3.43it/s][A
+ 51%|█████     | 64/125 [00:22<00:22,  2.75it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.87it/s][A
+ 51%|█████     | 64/125 [00:24<00:21,  2.83it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.67it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:15,  2.99it/s][A
+ 64%|██████▍   | 80/125 [00:26<00:14,  3.02it/s][A
+ 64%|██████▍   | 80/125 [00:27<00:14,  3.06it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:08,  3.36it/s][A
+ 64%|██████▍   | 80/125 [00:28<00:16,  2.79it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.83it/s][A
+ 64%|██████▍   | 80/125 [00:29<00:15,  2.92it/s][A
+ 77%|███████▋  | 96/125 [00:29<00:08,  3.24it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.54it/s][A
+ 77%|███████▋  | 96/125 [00:31<00:09,  3.07it/s][A
+ 77%|███████▋  | 96/125 [00:31<00:08,  3.26it/s][A
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.46it/s][A
+100%|██████████| 125/125 [00:33<00:00,  3.48it/s][A100%|██████████| 125/125 [00:33<00:00,  3.71it/s]
+
+ 77%|███████▋  | 96/125 [00:33<00:09,  3.20it/s][A
+ 77%|███████▋  | 96/125 [00:34<00:10,  2.67it/s][A
+100%|██████████| 125/125 [00:34<00:00,  3.28it/s][A100%|██████████| 125/125 [00:34<00:00,  3.61it/s]
+
+ 90%|████████▉ | 112/125 [00:34<00:04,  3.23it/s][A
+100%|██████████| 125/125 [00:35<00:00,  3.58it/s][A100%|██████████| 125/125 [00:35<00:00,  3.56it/s]
+
+ 90%|████████▉ | 112/125 [00:35<00:03,  3.58it/s][A
+ 90%|████████▉ | 112/125 [00:38<00:04,  3.08it/s][A
+ 90%|████████▉ | 112/125 [00:38<00:04,  2.76it/s][A
+100%|██████████| 125/125 [00:38<00:00,  3.64it/s][A100%|██████████| 125/125 [00:38<00:00,  3.25it/s]
+
+100%|██████████| 125/125 [00:39<00:00,  3.09it/s][A100%|██████████| 125/125 [00:39<00:00,  3.16it/s]
+
+ 90%|████████▉ | 112/125 [00:39<00:04,  3.03it/s][A
+100%|██████████| 125/125 [00:42<00:00,  3.07it/s][A100%|██████████| 125/125 [00:42<00:00,  2.95it/s]
+
+100%|██████████| 125/125 [00:43<00:00,  2.69it/s][A100%|██████████| 125/125 [00:43<00:00,  2.89it/s]
+
+100%|██████████| 125/125 [00:44<00:00,  2.92it/s][A100%|██████████| 125/125 [00:44<00:00,  2.80it/s]
+ 99%|█████████▉| 1000/1012 [00:44<00:00, 22.36it/s] 99%|█████████▉| 1000/1012 [00:44<00:00, 22.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:44<00:00, 22.24it/s] 99%|█████████▉| 1000/1012 [00:44<00:00, 22.25it/s]
+ 99%|█████████▉| 1000/1012 [00:44<00:00, 22.24it/s]  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:44<00:00, 22.24it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:45<00:00, 22.18it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:45<00:00, 22.15it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.11it/s][A100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.26s/it][A100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
+
+100%|██████████| 1/1 [00:00<00:00,  1.09it/s][A100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.07s/it][A100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.02s/it][A100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.12s/it][A100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.34s/it][A100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.46s/it][A100%|██████████| 2/2 [00:02<00:00,  1.46s/it]
+100%|██████████| 1012/1012 [00:48<00:00, 20.56it/s]100%|██████████| 1012/1012 [00:48<00:00, 20.60it/s]100%|██████████| 1012/1012 [00:48<00:00, 20.61it/s]100%|██████████| 1012/1012 [00:48<00:00, 20.60it/s]100%|██████████| 1012/1012 [00:48<00:00, 20.60it/s]100%|██████████| 1012/1012 [00:48<00:00, 20.57it/s]100%|██████████| 1012/1012 [00:48<00:00, 21.06it/s]
+100%|██████████| 1012/1012 [00:48<00:00, 20.63it/s]100%|████��█████| 1012/1012 [00:48<00:00, 21.06it/s]
+[rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 5594, 'num_samples': 126, 'runtime': 48.05021145567298, 'samples/s': 2.6222569304660985, 'tokens/s': 116.4198830875187}
+100%|██████████| 1012/1012 [00:48<00:00, 21.07it/s]100%|██████████| 1012/1012 [00:48<00:00, 21.06it/s]100%|██████████| 1012/1012 [00:48<00:00, 21.06it/s]
+
+
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 6078, 'num_samples': 127, 'runtime': 48.04965185187757, 'samples/s': 2.64309927554735, 'tokens/s': 126.49415273052594}
+100%|██████████| 1012/1012 [00:48<00:00, 21.07it/s]
+[rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 6034, 'num_samples': 126, 'runtime': 48.0322735439986, 'samples/s': 2.6232362264630527, 'tokens/s': 125.6238681783973}[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 5770, 'num_samples': 126, 'runtime': 48.05014795437455, 'samples/s': 2.6222603959438753, 'tokens/s': 120.08287686187428}[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 6069, 'num_samples': 127, 'runtime': 48.04980396851897, 'samples/s': 2.6430909079921996, 'tokens/s': 126.30644661893432}
+
+
+100%|██████████| 1012/1012 [00:48<00:00, 21.06it/s]
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 6193, 'num_samples': 127, 'runtime': 48.030539175495505, 'samples/s': 2.6441510376546757, 'tokens/s': 128.9387982377591}
+[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 5757, 'num_samples': 126, 'runtime': 48.04507370479405, 'samples/s': 2.6225373442902518, 'tokens/s': 119.82498008792841}
+100%|██████████| 1012/1012 [00:48<00:00, 20.62it/s]100%|██████████| 1012/1012 [00:48<00:00, 21.06it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 6454, 'num_samples': 127, 'runtime': 48.064477540552616, 'samples/s': 2.6422840005459016, 'tokens/s': 134.27796015372635}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:27:49.565546
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' bn = zh ']'
++ src_lang=bn
++ tgt_lang=zh
++ lp=bn2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/train.log
+[2025-09-15 22:28:22,596] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:28:29,519] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:28:29,678] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:28:29,745] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:28:29,882] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:28:29,961] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:28:29,975] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:28:30,004] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:28:30,019] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.bn2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.31s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.33s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.34s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:28:34.024242
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.02it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.48it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.44it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.37it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.28it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.76it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.59it/s][A
+ 13%|█▎        | 16/125 [00:06<00:42,  2.59it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.06it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.69it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.44it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.27it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.09it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.84it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.32it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.14it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.75it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.38it/s][A
+ 26%|██▌       | 32/125 [00:11<00:33,  2.74it/s][A
+ 26%|██▌       | 32/125 [00:12<00:37,  2.49it/s][A
+ 51%|█████     | 64/125 [00:12<00:11,  5.27it/s][A
+ 38%|███▊      | 48/125 [00:12<00:21,  3.65it/s][A
+ 51%|█████     | 64/125 [00:12<00:12,  5.06it/s][A
+ 38%|███▊      | 48/125 [00:13<00:23,  3.24it/s][A
+ 51%|█████     | 64/125 [00:14<00:12,  4.83it/s][A
+ 38%|███▊      | 48/125 [00:15<00:22,  3.39it/s][A
+ 38%|███▊      | 48/125 [00:16<00:26,  2.96it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:08,  5.16it/s][A
+ 51%|█████     | 64/125 [00:16<00:17,  3.57it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:10,  4.44it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.68it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.66it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.25it/s][A
+ 51%|█████     | 64/125 [00:19<00:16,  3.61it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.57it/s][A
+ 51%|█████     | 64/125 [00:20<00:18,  3.28it/s][A
+ 64%|██████▍   | 80/125 [00:20<00:11,  3.97it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.30it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:12,  3.50it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:07,  4.11it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.69it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:14,  3.16it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.49it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.10it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.06it/s][A
+ 64%|██████▍   | 80/125 [00:24<00:12,  3.46it/s][A
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.47it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.70it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:02,  4.48it/s][A
+100%|██████████| 125/125 [00:27<00:00,  4.27it/s][A100%|██████████| 125/125 [00:27<00:00,  4.57it/s]
+
+100%|██████████| 125/125 [00:27<00:00,  4.53it/s][A100%|██████████| 125/125 [00:27<00:00,  4.57it/s]
+
+ 77%|███████▋  | 96/125 [00:27<00:07,  3.65it/s][A
+ 77%|███████▋  | 96/125 [00:27<00:07,  3.78it/s][A
+100%|██████████| 125/125 [00:29<00:00,  4.91it/s][A100%|██████████| 125/125 [00:29<00:00,  4.31it/s]
+
+ 90%|████████▉ | 112/125 [00:30<00:03,  4.22it/s][A
+ 90%|████████▉ | 112/125 [00:30<00:03,  3.45it/s][A
+ 90%|████████▉ | 112/125 [00:31<00:03,  3.59it/s][A
+ 90%|████████▉ | 112/125 [00:32<00:03,  3.67it/s][A
+100%|██████████| 125/125 [00:33<00:00,  4.10it/s][A100%|██████████| 125/125 [00:33<00:00,  3.70it/s]
+
+100%|██████████| 125/125 [00:33<00:00,  3.54it/s][A100%|██████████| 125/125 [00:33<00:00,  3.69it/s]
+
+100%|██████████| 125/125 [00:35<00:00,  3.77it/s][A100%|██████████| 125/125 [00:35<00:00,  3.50it/s]
+
+100%|██████████| 125/125 [00:36<00:00,  3.12it/s][A100%|██████████| 125/125 [00:36<00:00,  3.41it/s]
+
+ 90%|████████▉ | 112/125 [02:06<00:29,  2.29s/it][A
+100%|██████████| 125/125 [02:09<00:00,  1.73s/it][A100%|██████████| 125/125 [02:09<00:00,  1.04s/it]
+ 99%|█████████▉| 1000/1012 [02:09<00:01,  7.72it/s] 99%|█████████▉| 1000/1012 [02:09<00:01,  7.72it/s] 99%|█████████▉| 1000/1012 [02:09<00:01,  7.72it/s] 99%|█████████▉| 1000/1012 [02:09<00:01,  7.72it/s] 99%|█████████▉| 1000/1012 [02:09<00:01,  7.72it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:09<00:01,  7.70it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:09<00:01,  7.70it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  3.41it/s][A100%|██████████| 1/1 [00:00<00:00,  3.41it/s]
+ 99%|█████████▉| 1000/1012 [02:09<00:01,  7.70it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.55it/s][A100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.53it/s][A100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.51it/s][A100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.22it/s][A100%|██████████| 2/2 [00:01<00:00,  1.22it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.40it/s][A100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.19it/s][A100%|██████████| 2/2 [00:01<00:00,  1.19it/s]
+100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]
+100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]
+100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]
+
+[rank1] {'num_prompt_tokens': 34238, 'num_generated_tokens': 3777, 'num_samples': 127, 'runtime': 131.60962335020304, 'samples/s': 0.9649750281714796, 'tokens/s': 28.69850930239117}[rank2] {'num_prompt_tokens': 33610, 'num_generated_tokens': 3614, 'num_samples': 127, 'runtime': 131.61497433669865, 'samples/s': 0.9649357957940821, 'tokens/s': 27.45888162204577}
+
+
+100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]
+
+[rank7] {'num_prompt_tokens': 29724, 'num_generated_tokens': 3160, 'num_samples': 126, 'runtime': 131.59272440336645, 'samples/s': 0.9574997445434501, 'tokens/s': 24.013485656803987}[rank5] {'num_prompt_tokens': 31313, 'num_generated_tokens': 3241, 'num_samples': 126, 'runtime': 131.61125439591706, 'samples/s': 0.9573649349239002, 'tokens/s': 24.625553603875876}
+
+[rank6] {'num_prompt_tokens': 29317, 'num_generated_tokens': 4307, 'num_samples': 126, 'runtime': 131.61212036572397, 'samples/s': 0.9573586357386463, 'tokens/s': 32.72494955655833}[rank3] {'num_prompt_tokens': 33748, 'num_generated_tokens': 3595, 'num_samples': 127, 'runtime': 131.6148917209357, 'samples/s': 0.9649364014923121, 'tokens/s': 27.314538294211513}
+
+[rank4] {'num_prompt_tokens': 32349, 'num_generated_tokens': 3239, 'num_samples': 126, 'runtime': 131.6148448791355, 'samples/s': 0.9573388177884363, 'tokens/s': 24.609685958863057}
+100%|██████████| 1012/1012 [02:11<00:00,  7.68it/s]100%|██████████| 1012/1012 [02:11<00:00,  7.69it/s]
+[rank0] {'num_prompt_tokens': 34392, 'num_generated_tokens': 3788, 'num_samples': 127, 'runtime': 131.63019154779613, 'samples/s': 0.9648242436377914, 'tokens/s': 28.777592400787036}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:30:49.354474
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=bn
++ lp=zh2bn
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/train.log
+[2025-09-15 22:31:22,366] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:31:29,328] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:31:29,426] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:31:29,447] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:31:29,534] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:31:29,535] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:31:29,831] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:31:29,853] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:31:29,854] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2bn.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.06s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.00s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.61s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.54s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:31:34.190395
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
+
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:12<01:28,  1.23it/s][A
+ 13%|█▎        | 16/125 [00:14<01:35,  1.14it/s][A
+ 13%|█▎        | 16/125 [00:19<02:11,  1.20s/it][A
+ 13%|█▎        | 16/125 [00:19<02:15,  1.24s/it][A
+ 26%|██▌       | 32/125 [00:24<01:10,  1.32it/s][A
+ 26%|██▌       | 32/125 [00:29<01:26,  1.07it/s][A
+ 26%|██▌       | 32/125 [00:31<01:28,  1.05it/s][A
+ 38%|███▊      | 48/125 [00:37<00:58,  1.32it/s][A
+ 26%|██▌       | 32/125 [00:37<01:49,  1.18s/it][A
+ 38%|███▊      | 48/125 [00:49<01:14,  1.03it/s][A
+ 38%|███▊      | 48/125 [00:49<01:18,  1.03s/it][A
+ 51%|█████     | 64/125 [01:12<01:12,  1.19s/it][A
+ 13%|█▎        | 16/125 [01:31<10:23,  5.72s/it][A
+ 13%|█▎        | 16/125 [01:31<10:25,  5.74s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.87s/it][A
+ 13%|█▎        | 16/125 [01:34<10:41,  5.89s/it][A
+ 26%|██▌       | 32/125 [01:44<04:21,  2.82s/it][A
+ 26%|██▌       | 32/125 [01:48<04:33,  2.94s/it][A
+ 38%|███▊      | 48/125 [02:03<04:05,  3.19s/it][A
+ 38%|███▊      | 48/125 [02:04<02:42,  2.11s/it][A
+ 51%|█████     | 64/125 [02:08<02:45,  2.72s/it][A
+ 51%|█████     | 64/125 [02:19<02:22,  2.33s/it][A
+ 51%|█████     | 64/125 [02:23<02:57,  2.91s/it][A
+ 64%|██████▍   | 80/125 [02:33<01:21,  1.81s/it][A
+ 64%|██████▍   | 80/125 [02:38<01:39,  2.20s/it][A
+ 64%|██████▍   | 80/125 [02:44<02:07,  2.83s/it][A
+ 77%|███████▋  | 96/125 [02:44<00:41,  1.43s/it][A
+ 77%|███████▋  | 96/125 [02:53<00:51,  1.77s/it][A
+ 26%|██▌       | 32/125 [03:01<08:45,  5.65s/it][A
+ 90%|████████▉ | 112/125 [03:09<00:19,  1.51s/it][A
+ 26%|██▌       | 32/125 [03:09<09:12,  5.94s/it][A
+ 38%|███▊      | 48/125 [03:12<04:21,  3.40s/it][A
+100%|██████████| 125/125 [03:20<00:00,  1.34s/it][A100%|██████████| 125/125 [03:20<00:00,  1.61s/it]
+
+ 38%|███▊      | 48/125 [03:21<05:27,  4.26s/it][A
+ 51%|█████     | 64/125 [03:29<02:31,  2.48s/it][A
+ 51%|█████     | 64/125 [03:35<03:35,  3.53s/it][A
+ 64%|██████▍   | 80/125 [03:39<02:50,  3.79s/it][A
+ 77%|███████▋  | 96/125 [04:13<01:49,  3.77s/it][A
+ 90%|████████▉ | 112/125 [04:18<00:37,  2.89s/it][A
+ 90%|████████▉ | 112/125 [04:24<00:35,  2.75s/it][A
+100%|██████████| 125/125 [04:33<00:00,  2.41s/it][A100%|██████████| 125/125 [04:33<00:00,  2.19s/it]
+
+100%|██████████| 125/125 [04:35<00:00,  2.22s/it][A100%|██████████| 125/125 [04:35<00:00,  2.20s/it]
+
+ 38%|███▊      | 48/125 [04:42<07:33,  5.88s/it][A
+ 51%|█████     | 64/125 [04:54<04:56,  4.87s/it][A
+ 51%|█████     | 64/125 [04:56<03:57,  3.90s/it][A
+ 64%|██████▍   | 80/125 [05:00<02:43,  3.64s/it][A
+ 64%|██████▍   | 80/125 [05:09<03:16,  4.36s/it][A
+ 77%|███████▋  | 96/125 [05:11<02:09,  4.45s/it][A
+ 64%|██████▍   | 80/125 [05:11<02:07,  2.83s/it][A
+ 64%|██████▍   | 80/125 [05:13<02:39,  3.55s/it][A
+ 77%|███████▋  | 96/125 [05:18<01:20,  2.77s/it][A
+ 77%|███████▋  | 96/125 [05:21<01:31,  3.14s/it][A
+ 90%|████████▉ | 112/125 [05:27<00:43,  3.32s/it][A
+ 77%|███████▋  | 96/125 [05:27<01:03,  2.20s/it][A
+ 90%|████████▉ | 112/125 [05:34<00:31,  2.39s/it][A
+100%|██████████| 125/125 [05:39<00:00,  2.68s/it][A100%|██████████| 125/125 [05:39<00:00,  2.72s/it]
+
+ 90%|████████▉ | 112/125 [05:49<00:25,  1.95s/it][A
+100%|██████████| 125/125 [06:05<00:00,  1.75s/it][A100%|██████████| 125/125 [06:05<00:00,  2.93s/it]
+
+ 77%|███████▋  | 96/125 [06:47<02:05,  4.33s/it][A
+ 90%|████████▉ | 112/125 [06:49<00:48,  3.73s/it][A
+100%|██████████| 125/125 [06:55<00:00,  3.44s/it][A100%|██████████| 125/125 [06:55<00:00,  3.32s/it]
+
+100%|██████████| 125/125 [07:00<00:00,  2.94s/it][A100%|██████████| 125/125 [07:00<00:00,  3.37s/it]
+
+ 90%|████████▉ | 112/125 [07:02<00:41,  3.23s/it][A
+100%|██████████| 125/125 [07:15<00:00,  2.61s/it][A100%|██████████| 125/125 [07:15<00:00,  3.48s/it]
+ 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s] 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [07:15<00:05,  2.30it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:02<00:00,  2.66s/it][A100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.06s/it][A100%|██████████| 1/1 [00:03<00:00,  3.06s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.32s/it][A100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.03s/it][A100%|██████████| 1/1 [00:04<00:00,  4.03s/it]
+
+100%|██████████| 2/2 [00:05<00:00,  2.52s/it][A100%|██████████| 2/2 [00:05<00:00,  2.52s/it]
+
+100%|██████████| 2/2 [00:06<00:00,  3.39s/it][A100%|██████████| 2/2 [00:06<00:00,  3.39s/it]
+
+100%|██████████| 2/2 [00:08<00:00,  4.37s/it][A100%|██████████| 2/2 [00:08<00:00,  4.37s/it]
+
+100%|██████████| 2/2 [00:09<00:00,  4.91s/it][A100%|██████████| 2/2 [00:09<00:00,  4.91s/it]
+100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]
+100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]
+100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]
+[rank1] {'num_prompt_tokens': 10543, 'num_generated_tokens': 22549, 'num_samples': 127, 'runtime': 445.13155778869987, 'samples/s': 0.28530891099005345, 'tokens/s': 50.656934125312716}
+
+100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]
+[rank5] {'num_prompt_tokens': 9025, 'num_generated_tokens': 21621, 'num_samples': 126, 'runtime': 445.1326715312898, 'samples/s': 0.28306167589665016, 'tokens/s': 48.572035671122805}
+[rank2] {'num_prompt_tokens': 10036, 'num_generated_tokens': 20575, 'num_samples': 127, 'runtime': 445.1320131383836, 'samples/s': 0.28530861913209093, 'tokens/s': 46.22224282395883}
+100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s][rank4] {'num_prompt_tokens': 8829, 'num_generated_tokens': 19594, 'num_samples': 126, 'runtime': 445.1328128874302, 'samples/s': 0.28306158600773423, 'tokens/s': 44.018323144726544}
+
+[rank7] {'num_prompt_tokens': 8723, 'num_generated_tokens': 20381, 'num_samples': 126, 'runtime': 445.13262835517526, 'samples/s': 0.2830617033525197, 'tokens/s': 45.78635377799765}
+[rank6] {'num_prompt_tokens': 9027, 'num_generated_tokens': 22027, 'num_samples': 126, 'runtime': 445.12897697649896, 'samples/s': 0.2830640252985649, 'tokens/s': 49.484534009932446}
+100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]
+100%|██████████| 1012/1012 [07:25<00:00,  2.26it/s]100%|██████████| 1012/1012 [07:25<00:00,  2.27it/s]
+[rank3] {'num_prompt_tokens': 10346, 'num_generated_tokens': 19390, 'num_samples': 127, 'runtime': 445.1318674404174, 'samples/s': 0.28530871251764384, 'tokens/s': 43.560125478087514}
+[rank0] {'num_prompt_tokens': 10826, 'num_generated_tokens': 19366, 'num_samples': 127, 'runtime': 445.14760736748576, 'samples/s': 0.2852986243171174, 'tokens/s': 43.504670539569254}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:39:02.804497
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' hi = zh ']'
++ src_lang=hi
++ tgt_lang=zh
++ lp=hi2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/train.log
+[2025-09-15 22:39:36,089] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:39:42,865] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:39:42,915] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:39:43,151] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:39:43,283] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:39:43,300] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 22:39:43,381] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:39:43,429] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:39:43,481] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.hi2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.12s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.93s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.23s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.51s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:39:47.500143
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:20,  5.34it/s][A
+ 13%|█▎        | 16/125 [00:03<00:20,  5.26it/s][A
+ 13%|█▎        | 16/125 [00:03<00:24,  4.37it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.13it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.09it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.91it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.75it/s][A
+ 13%|█▎        | 16/125 [00:06<00:41,  2.63it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.12it/s][A
+ 26%|██▌       | 32/125 [00:06<00:17,  5.24it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.81it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  4.95it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.80it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.55it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.39it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.22it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.72it/s][A
+ 26%|██▌       | 32/125 [00:10<00:30,  3.05it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.36it/s][A
+ 38%|███▊      | 48/125 [00:11<00:17,  4.35it/s][A
+ 26%|██▌       | 32/125 [00:11<00:33,  2.79it/s][A
+ 38%|███▊      | 48/125 [00:12<00:21,  3.55it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.79it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.78it/s][A
+ 51%|█████     | 64/125 [00:13<00:12,  4.73it/s][A
+ 38%|███▊      | 48/125 [00:13<00:21,  3.65it/s][A
+ 38%|███▊      | 48/125 [00:14<00:21,  3.63it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.19it/s][A
+ 51%|█████     | 64/125 [00:15<00:15,  3.92it/s][A
+ 51%|█████     | 64/125 [00:15<00:15,  4.02it/s][A
+ 51%|█████     | 64/125 [00:16<00:13,  4.39it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.81it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.32it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.39it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.47it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  4.01it/s][A
+ 51%|█████     | 64/125 [00:19<00:18,  3.32it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.31it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.63it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.36it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.44it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:12,  3.48it/s][A
+ 77%|███████▋  | 96/125 [00:21<00:06,  4.59it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:06,  4.25it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  4.69it/s][A
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.13it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  3.98it/s][A
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.41it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.72it/s][A
+ 64%|██████▍   | 80/125 [00:25<00:14,  3.14it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:03,  4.19it/s][A
+100%|██████████| 125/125 [00:26<00:00,  5.39it/s][A100%|██████████| 125/125 [00:26<00:00,  4.81it/s]
+
+100%|██████████| 125/125 [00:26<00:00,  4.71it/s][A100%|██████████| 125/125 [00:26<00:00,  4.71it/s]
+
+ 90%|████████▉ | 112/125 [00:27<00:03,  4.27it/s][A
+100%|██████████| 125/125 [00:28<00:00,  4.21it/s][A100%|██████████| 125/125 [00:28<00:00,  4.45it/s]
+
+100%|██████████| 125/125 [00:28<00:00,  4.37it/s][A100%|██████████| 125/125 [00:28<00:00,  4.45it/s]
+
+ 77%|███████▋  | 96/125 [00:28<00:07,  3.67it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.54it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.79it/s][A
+100%|██████████| 125/125 [00:31<00:00,  4.04it/s][A100%|██████████| 125/125 [00:31<00:00,  3.95it/s]
+
+100%|██████████| 125/125 [00:32<00:00,  3.47it/s][A100%|██████████| 125/125 [00:32<00:00,  3.86it/s]
+
+100%|██████████| 125/125 [00:33<00:00,  3.37it/s][A100%|██████████| 125/125 [00:33<00:00,  3.71it/s]
+
+ 90%|████████▉ | 112/125 [00:34<00:04,  3.24it/s][A
+100%|██████████| 125/125 [00:38<00:00,  3.32it/s][A100%|██████████| 125/125 [00:38<00:00,  3.28it/s]
+ 99%|█████████▉| 1000/1012 [00:38<00:00, 26.15it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.15it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.16it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.15it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.16it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.15it/s] 99%|█████████▉| 1000/1012 [00:38<00:00, 26.15it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s][A[A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:38<00:00, 25.94it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.46it/s][A100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.92it/s][A100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.54it/s][A100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.48it/s][A100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A100%|██████████| 2/2 [00:01<00:00,  1.85it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.58it/s][A100%|██████████| 2/2 [00:01<00:00,  1.58it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.28it/s][A100%|██████████| 2/2 [00:01<00:00,  1.28it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.08it/s][A100%|██████████| 2/2 [00:01<00:00,  1.08it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.88it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.87it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.87it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.87it/s]100%|██████████| 1012/1012 [00:40<00:00, 25.25it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.88it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.87it/s]100%|██████████| 1012/1012 [00:40<00:00, 24.87it/s]100%|██████████| 1012/1012 [00:40<00:00, 25.24it/s]
+[rank1] {'num_prompt_tokens': 28921, 'num_generated_tokens': 3678, 'num_samples': 127, 'runtime': 40.082095842808485, 'samples/s': 3.1684969892308237, 'tokens/s': 91.76166871173992}
+100%|██████████| 1012/1012 [00:40<00:00, 25.24it/s]100%|██████████| 1012/1012 [00:40<00:00, 25.24it/s]100%|██████████| 1012/1012 [00:40<00:00, 25.25it/s]
+
+[rank3] {'num_prompt_tokens': 28563, 'num_generated_tokens': 3526, 'num_samples': 127, 'runtime': 40.1014346703887, 'samples/s': 3.166968988612721, 'tokens/s': 87.92702877046027}
+
+100%|██████████| 1012/1012 [00:40<00:00, 25.24it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 25.24it/s]
+100%|██████████| 1012/1012 [00:40<00:00, 24.94it/s][rank4] {'num_prompt_tokens': 28977, 'num_generated_tokens': 3184, 'num_samples': 126, 'runtime': 40.10160348750651, 'samples/s': 3.1420189978002946, 'tokens/s': 79.3983213412392}
+[rank2] {'num_prompt_tokens': 28690, 'num_generated_tokens': 3543, 'num_samples': 127, 'runtime': 40.101152915507555, 'samples/s': 3.1669912400670084, 'tokens/s': 88.35157451620009}
+[rank7] {'num_prompt_tokens': 27316, 'num_generated_tokens': 3205, 'num_samples': 126, 'runtime': 40.08468414284289, 'samples/s': 3.1433452126252384, 'tokens/s': 79.9557254481261}
+[rank5] {'num_prompt_tokens': 29156, 'num_generated_tokens': 3200, 'num_samples': 126, 'runtime': 40.10134992748499, 'samples/s': 3.1420388647226334, 'tokens/s': 79.79781243740022}
+[rank6] {'num_prompt_tokens': 26138, 'num_generated_tokens': 3266, 'num_samples': 126, 'runtime': 40.10166667960584, 'samples/s': 3.142014046615143, 'tokens/s': 81.44299901781791}
+100%|██████████| 1012/1012 [00:40<00:00, 25.23it/s]
+[rank0] {'num_prompt_tokens': 29088, 'num_generated_tokens': 3853, 'num_samples': 127, 'runtime': 40.1162818223238, 'samples/s': 3.1657968842298687, 'tokens/s': 96.04579051132035}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:40:31.699140
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=hi
++ lp=zh2hi
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/train.log
+[2025-09-15 22:41:06,057] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:41:12,983] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:41:13,046] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:41:13,112] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:41:13,426] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:41:13,567] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:41:13,569] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:41:13,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:41:13,594] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2hi.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.87s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:41:17.580545
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s]
+[A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:10<01:12,  1.50it/s][A
+ 13%|█▎        | 16/125 [00:12<01:22,  1.31it/s][A
+ 13%|█▎        | 16/125 [00:12<01:24,  1.29it/s][A
+ 13%|█▎        | 16/125 [00:12<01:27,  1.24it/s][A
+ 13%|█▎        | 16/125 [00:13<01:28,  1.22it/s][A
+ 13%|█▎        | 16/125 [00:18<02:06,  1.16s/it][A
+ 13%|█▎        | 16/125 [00:19<02:14,  1.23s/it][A
+ 26%|██▌       | 32/125 [00:20<00:57,  1.62it/s][A
+ 26%|██▌       | 32/125 [00:21<01:03,  1.46it/s][A
+ 26%|██▌       | 32/125 [00:22<01:03,  1.47it/s][A
+ 26%|██▌       | 32/125 [00:26<01:15,  1.23it/s][A
+ 26%|██▌       | 32/125 [00:28<01:24,  1.10it/s][A
+ 26%|██▌       | 32/125 [00:29<01:21,  1.14it/s][A
+ 38%|███▊      | 48/125 [00:32<00:52,  1.47it/s][A
+ 38%|███▊      | 48/125 [00:34<00:56,  1.37it/s][A
+ 38%|███▊      | 48/125 [00:38<01:01,  1.24it/s][A
+ 38%|███▊      | 48/125 [00:42<01:09,  1.11it/s][A
+ 38%|███▊      | 48/125 [00:43<01:06,  1.16it/s][A
+ 51%|█████     | 64/125 [00:45<00:43,  1.41it/s][A
+ 51%|█████     | 64/125 [00:48<00:49,  1.23it/s][A
+ 51%|█████     | 64/125 [00:52<00:50,  1.21it/s][A
+ 51%|█████     | 64/125 [00:52<00:48,  1.26it/s][A
+ 51%|█████     | 64/125 [01:04<01:04,  1.05s/it][A
+ 64%|██████▍   | 80/125 [01:05<00:36,  1.23it/s][A
+ 64%|██████▍   | 80/125 [01:07<00:42,  1.07it/s][A
+ 77%|███████▋  | 96/125 [01:16<00:22,  1.29it/s][A
+ 77%|███████▋  | 96/125 [01:20<00:25,  1.12it/s][A
+ 90%|████████▉ | 112/125 [01:29<00:10,  1.28it/s][A
+ 13%|█▎        | 16/125 [01:34<10:40,  5.88s/it][A
+ 90%|████████▉ | 112/125 [01:34<00:11,  1.12it/s][A
+100%|██████████| 125/125 [01:38<00:00,  1.32it/s][A100%|██████████| 125/125 [01:38<00:00,  1.27it/s]
+
+ 26%|██▌       | 32/125 [01:44<04:20,  2.80s/it][A
+ 26%|██▌       | 32/125 [01:55<06:13,  4.01s/it][A
+ 38%|███▊      | 48/125 [01:56<02:24,  1.88s/it][A
+ 38%|███▊      | 48/125 [01:58<04:00,  3.13s/it][A
+ 38%|███▊      | 48/125 [02:05<03:10,  2.47s/it][A
+ 51%|█████     | 64/125 [02:09<02:11,  2.16s/it][A
+ 51%|█████     | 64/125 [02:11<01:32,  1.51s/it][A
+ 64%|██████▍   | 80/125 [02:17<01:54,  2.54s/it][A
+ 51%|█████     | 64/125 [02:18<01:50,  1.82s/it][A
+ 64%|██████▍   | 80/125 [02:26<01:57,  2.61s/it][A
+ 64%|██████▍   | 80/125 [02:31<01:06,  1.47s/it][A
+ 64%|██████▍   | 80/125 [02:37<02:04,  2.76s/it][A
+ 77%|███████▋  | 96/125 [02:39<00:58,  2.01s/it][A
+ 77%|███████▋  | 96/125 [02:43<00:35,  1.23s/it][A
+ 77%|███████▋  | 96/125 [02:47<00:59,  2.04s/it][A
+100%|██████████| 125/125 [02:55<00:00,  2.35s/it][A100%|██████████| 125/125 [02:55<00:00,  1.40s/it]
+
+ 90%|████████▉ | 112/125 [02:57<00:14,  1.10s/it][A
+ 90%|████████▉ | 112/125 [02:57<00:20,  1.58s/it][A
+100%|██████████| 125/125 [03:08<00:00,  1.37s/it][A100%|██████████| 125/125 [03:08<00:00,  1.50s/it]
+
+ 64%|██████▍   | 80/125 [03:44<02:17,  3.06s/it][A
+ 64%|██████▍   | 80/125 [03:44<02:38,  3.53s/it][A
+ 77%|███████▋  | 96/125 [03:49<01:44,  3.62s/it][A
+ 77%|███████▋  | 96/125 [03:58<01:15,  2.61s/it][A
+ 90%|████████▉ | 112/125 [04:08<00:25,  1.98s/it][A
+ 90%|████████▉ | 112/125 [04:10<00:41,  3.22s/it][A
+100%|██████████| 125/125 [04:17<00:00,  2.49s/it][A100%|██████████| 125/125 [04:17<00:00,  2.06s/it]
+
+ 77%|███████▋  | 96/125 [05:17<01:55,  4.00s/it][A
+ 90%|████████▉ | 112/125 [05:19<00:55,  4.29s/it][A
+ 90%|████████▉ | 112/125 [05:29<00:38,  2.93s/it][A
+100%|██████████| 125/125 [05:30<00:00,  4.02s/it][A100%|██████████| 125/125 [05:30<00:00,  2.65s/it]
+
+100%|██████████| 125/125 [05:30<00:00,  3.17s/it][A100%|██████████| 125/125 [05:30<00:00,  2.65s/it]
+
+100%|██████████| 125/125 [05:32<00:00,  3.37s/it][A100%|██████████| 125/125 [05:32<00:00,  2.66s/it]
+
+100%|██████████| 125/125 [06:50<00:00,  3.83s/it][A100%|██████████| 125/125 [06:50<00:00,  3.28s/it]
+ 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s] 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [06:50<00:04,  2.44it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:02<00:00,  2.17s/it][A100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
+
+100%|██████████| 1/1 [00:02<00:00,  2.95s/it][A100%|██████████| 1/1 [00:02<00:00,  2.95s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.02s/it][A100%|██████████| 1/1 [00:03<00:00,  3.02s/it]
+
+100%|██████████| 1/1 [00:03<00:00,  3.61s/it][A100%|██████████| 1/1 [00:03<00:00,  3.61s/it]
+
+100%|██████████| 2/2 [00:04<00:00,  2.49s/it][A100%|██████████| 2/2 [00:04<00:00,  2.49s/it]
+
+100%|██████████| 2/2 [00:05<00:00,  2.74s/it][A100%|██████████| 2/2 [00:05<00:00,  2.74s/it]
+
+100%|██████████| 2/2 [00:07<00:00,  3.70s/it][A100%|██████████| 2/2 [00:07<00:00,  3.70s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.16s/it][A100%|██████████| 2/2 [00:44<00:00, 22.16s/it]
+100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]
+100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]
+
+100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s][rank7] {'num_prompt_tokens': 8471, 'num_generated_tokens': 22294, 'num_samples': 126, 'runtime': 454.83819127082825, 'samples/s': 0.2770215923336454, 'tokens/s': 49.01523317052611}
+100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.23it/s][rank2] {'num_prompt_tokens': 9782, 'num_generated_tokens': 19234, 'num_samples': 127, 'runtime': 454.8375843670219, 'samples/s': 0.279220548971872, 'tokens/s': 42.28762235374005}
+[rank1] {'num_prompt_tokens': 10289, 'num_generated_tokens': 18195, 'num_samples': 127, 'runtime': 454.8377018235624, 'samples/s': 0.2792204768664164, 'tokens/s': 40.00328013058619}
+
+100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]
+
+100%|██████████| 1012/1012 [07:34<00:00,  2.15it/s][rank3] {'num_prompt_tokens': 10092, 'num_generated_tokens': 17056, 'num_samples': 127, 'runtime': 454.81428357586265, 'samples/s': 0.27923485384296753, 'tokens/s': 37.50102100114689}
+100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]
+[rank6] {'num_prompt_tokens': 8775, 'num_generated_tokens': 18880, 'num_samples': 126, 'runtime': 454.83574973605573, 'samples/s': 0.2770230793712206, 'tokens/s': 41.50948998832258}
+[rank5] {'num_prompt_tokens': 8773, 'num_generated_tokens': 15337, 'num_samples': 126, 'runtime': 454.8382773809135, 'samples/s': 0.27702153988785505, 'tokens/s': 33.719677438571686}
+100%|██████████| 1012/1012 [07:34<00:00,  2.22it/s]
+[rank4] {'num_prompt_tokens': 8577, 'num_generated_tokens': 17189, 'num_samples': 126, 'runtime': 454.8387878276408, 'samples/s': 0.27702122899805803, 'tokens/s': 37.791411946409674}
+[rank0] {'num_prompt_tokens': 10572, 'num_generated_tokens': 19676, 'num_samples': 127, 'runtime': 454.853282103315, 'samples/s': 0.2792109126106148, 'tokens/s': 43.25790485453903}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:48:56.027558
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' th = zh ']'
++ src_lang=th
++ tgt_lang=zh
++ lp=th2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/train.log
+[2025-09-15 22:49:29,469] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:49:35,395] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:49:36,312] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-15 22:49:36,691] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:49:36,755] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:49:36,775] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:49:36,846] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:49:36,880] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:49:36,881] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.th2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.36s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.41s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:49:40.925810
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.75it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.90it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.70it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.61it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.54it/s][A
+ 13%|█▎        | 16/125 [00:03<00:26,  4.08it/s][A
+ 13%|█▎        | 16/125 [00:04<00:31,  3.49it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.55it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.82it/s][A
+ 13%|█▎        | 16/125 [00:06<00:41,  2.61it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  4.97it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.83it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.67it/s][A
+ 38%|███▊      | 48/125 [00:07<00:11,  6.67it/s][A
+ 26%|██▌       | 32/125 [00:07<00:22,  4.22it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.09it/s][A
+ 38%|███▊      | 48/125 [00:09<00:14,  5.26it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  5.09it/s][A
+ 26%|██▌       | 32/125 [00:09<00:29,  3.14it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.64it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  6.02it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  6.03it/s][A
+ 26%|██▌       | 32/125 [00:11<00:32,  2.87it/s][A
+ 51%|█████     | 64/125 [00:12<00:10,  5.56it/s][A
+ 38%|███▊      | 48/125 [00:13<00:20,  3.79it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.46it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.58it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:08,  5.45it/s][A
+ 38%|███▊      | 48/125 [00:14<00:20,  3.71it/s][A
+ 38%|███▊      | 48/125 [00:14<00:23,  3.22it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:08,  5.34it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:07,  5.92it/s][A
+ 51%|█████     | 64/125 [00:15<00:13,  4.62it/s][A
+ 64%|██████▍   | 80/125 [00:16<00:09,  4.93it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.32it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.40it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.79it/s][A
+ 77%|███████▋  | 96/125 [00:17<00:05,  5.65it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.16it/s][A
+ 51%|█████     | 64/125 [00:18<00:16,  3.65it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:09,  4.62it/s][A
+ 77%|███████▋  | 96/125 [00:19<00:05,  4.93it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.48it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.81it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.02it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.39it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:11,  3.80it/s][A
+100%|██████████| 125/125 [00:22<00:00,  5.22it/s][A100%|██████████| 125/125 [00:22<00:00,  5.48it/s]
+
+ 90%|████████▉ | 112/125 [00:23<00:02,  5.09it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.02it/s][A100%|██████████| 125/125 [00:23<00:00,  5.38it/s]
+
+ 64%|██████▍   | 80/125 [00:23<00:13,  3.29it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:06,  4.17it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.26it/s][A100%|██████████| 125/125 [00:23<00:00,  5.32it/s]
+
+100%|██████████| 125/125 [00:24<00:00,  5.55it/s][A100%|██████████| 125/125 [00:24<00:00,  5.02it/s]
+
+ 90%|████████▉ | 112/125 [00:25<00:03,  4.01it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:06,  4.20it/s][A
+ 77%|███████▋  | 96/125 [00:25<00:07,  3.98it/s][A
+ 90%|████████▉ | 112/125 [00:25<00:02,  4.72it/s][A
+100%|██████████| 125/125 [00:28<00:00,  3.97it/s][A100%|██████████| 125/125 [00:28<00:00,  4.34it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.50it/s][A100%|██████████| 125/125 [00:29<00:00,  4.30it/s]
+
+ 90%|████████▉ | 112/125 [00:29<00:03,  4.16it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.96it/s][A
+100%|██████████| 125/125 [00:33<00:00,  3.89it/s][A100%|██████████| 125/125 [00:33<00:00,  3.74it/s]
+
+100%|██████████| 125/125 [00:34<00:00,  3.65it/s][A100%|██████████| 125/125 [00:34<00:00,  3.67it/s]
+ 99%|█████████▉| 1000/1012 [00:34<00:00, 29.28it/s] 99%|█████████▉| 1000/1012 [00:34<00:00, 29.27it/s] 99%|█████████▉| 1000/1012 [00:34<00:00, 29.27it/s] 99%|█████████▉| 1000/1012 [00:34<00:00, 29.28it/s] 99%|█████████▉| 1000/1012 [00:34<00:00, 29.27it/s] 99%|█████████▉| 1000/1012 [00:34<00:00, 29.28it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:34<00:00, 29.08it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [00:34<00:00, 28.99it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.06it/s][A100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  2.05it/s][A100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.81it/s][A100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.04it/s][A100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.83it/s][A100%|██████████| 2/2 [00:01<00:00,  1.83it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.42it/s][A100%|██████████| 2/2 [00:01<00:00,  1.42it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.13it/s][A100%|██████████| 2/2 [00:01<00:00,  1.13it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.26it/s][A100%|██████████| 2/2 [00:01<00:00,  1.26it/s]
+100%|██████████| 1012/1012 [00:36<00:00, 27.54it/s]100%|██████████| 1012/1012 [00:36<00:00, 27.67it/s]100%|██████████| 1012/1012 [00:36<00:00, 27.54it/s]100%|██████████| 1012/1012 [00:36<00:00, 28.03it/s]100%|██████████| 1012/1012 [00:36<00:00, 27.54it/s]100%|██████████| 1012/1012 [00:36<00:00, 27.54it/s]100%|██████████| 1012/1012 [00:36<00:00, 27.55it/s]
+100%|██████████| 1012/1012 [00:36<00:00, 27.55it/s]100%|██████████| 1012/1012 [00:36<00:00, 28.03it/s]100%|██████████| 1012/1012 [00:36<00:00, 28.05it/s]
+
+[rank7] {'num_prompt_tokens': 16654, 'num_generated_tokens': 3126, 'num_samples': 126, 'runtime': 36.10118059441447, 'samples/s': 3.4901905678811667, 'tokens/s': 86.58996599362322}
+100%|██████████| 1012/1012 [00:36<00:00, 28.03it/s]100%|██████████| 1012/1012 [00:36<00:00, 28.04it/s]100%|██████████| 1012/1012 [00:36<00:00, 28.03it/s]
+
+
+[rank2] {'num_prompt_tokens': 17613, 'num_generated_tokens': 3570, 'num_samples': 127, 'runtime': 36.10080772265792, 'samples/s': 3.5179268279997817, 'tokens/s': 98.88975414141119}
+[rank1] {'num_prompt_tokens': 19339, 'num_generated_tokens': 3857, 'num_samples': 127, 'runtime': 36.083151368424296, 'samples/s': 3.519648234248613, 'tokens/s': 106.89199401178662}
+100%|██████████| 1012/1012 [00:36<00:00, 28.04it/s]
+[rank4] {'num_prompt_tokens': 17731, 'num_generated_tokens': 3229, 'num_samples': 126, 'runtime': 36.10096452385187, 'samples/s': 3.4902114572798166, 'tokens/s': 89.443593615528}
+[rank6] {'num_prompt_tokens': 16331, 'num_generated_tokens': 3384, 'num_samples': 126, 'runtime': 36.08924945257604, 'samples/s': 3.491344428361509, 'tokens/s': 93.76753607599481}
+[rank3] {'num_prompt_tokens': 18962, 'num_generated_tokens': 3538, 'num_samples': 127, 'runtime': 36.10052549652755, 'samples/s': 3.5179543303937755, 'tokens/s': 98.00411355065494}
+100%|██████████| 1012/1012 [00:36<00:00, 27.60it/s][rank5] {'num_prompt_tokens': 16286, 'num_generated_tokens': 3368, 'num_samples': 126, 'runtime': 36.08918829634786, 'samples/s': 3.4913503447444096, 'tokens/s': 93.32434889761247}
+100%|██████████| 1012/1012 [00:36<00:00, 28.02it/s]
+[rank0] {'num_prompt_tokens': 19742, 'num_generated_tokens': 3876, 'num_samples': 127, 'runtime': 36.11572103574872, 'samples/s': 3.5164741657598513, 'tokens/s': 107.32168398807232}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:50:20.648027
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=th
++ lp=zh2th
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/train.log
+[2025-09-15 22:50:53,453] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:51:00,477] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:51:00,480] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:51:00,576] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:51:00,621] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:51:00,907] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:51:00,966] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:51:00,975] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:51:00,985] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2th.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.10s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.59s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.60s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.59s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:51:05.066273
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.49s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.49s/it]
+
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:05<00:36,  2.96it/s][A
+ 13%|█▎        | 16/125 [00:05<00:38,  2.80it/s][A
+ 13%|█▎        | 16/125 [00:06<00:41,  2.61it/s][A
+ 13%|█▎        | 16/125 [00:06<00:47,  2.30it/s][A
+ 13%|█▎        | 16/125 [00:07<00:50,  2.16it/s][A
+ 13%|█▎        | 16/125 [00:08<00:59,  1.82it/s][A
+ 26%|██▌       | 32/125 [00:11<00:34,  2.69it/s][A
+ 26%|██▌       | 32/125 [00:11<00:32,  2.83it/s][A
+ 26%|██▌       | 32/125 [00:12<00:36,  2.54it/s][A
+ 26%|██▌       | 32/125 [00:15<00:42,  2.18it/s][A
+ 38%|███▊      | 48/125 [00:16<00:25,  3.02it/s][A
+ 26%|██▌       | 32/125 [00:16<00:50,  1.85it/s][A
+ 38%|███▊      | 48/125 [00:17<00:28,  2.75it/s][A
+ 38%|███▊      | 48/125 [00:20<00:34,  2.21it/s][A
+ 38%|███▊      | 48/125 [00:22<00:36,  2.11it/s][A
+ 51%|█████     | 64/125 [00:23<00:22,  2.66it/s][A
+ 51%|█████     | 64/125 [00:23<00:23,  2.63it/s][A
+ 51%|█████     | 64/125 [00:27<00:27,  2.23it/s][A
+ 64%|██████▍   | 80/125 [00:31<00:19,  2.35it/s][A
+ 64%|██████▍   | 80/125 [00:32<00:19,  2.26it/s][A
+ 51%|█████     | 64/125 [00:34<00:35,  1.73it/s][A
+ 64%|██████▍   | 80/125 [00:35<00:20,  2.17it/s][A
+ 77%|███████▋  | 96/125 [00:39<00:12,  2.24it/s][A
+ 77%|███████▋  | 96/125 [00:40<00:12,  2.23it/s][A
+ 64%|██████▍   | 80/125 [00:42<00:24,  1.85it/s][A
+ 77%|███████▋  | 96/125 [00:42<00:13,  2.23it/s][A
+ 90%|████████▉ | 112/125 [00:45<00:05,  2.34it/s][A
+ 90%|████████▉ | 112/125 [00:47<00:05,  2.25it/s][A
+ 77%|███████▋  | 96/125 [00:48<00:14,  2.06it/s][A
+100%|██████████| 125/125 [00:50<00:00,  2.41it/s][A100%|██████████| 125/125 [00:50<00:00,  2.45it/s]
+
+100%|██████████| 125/125 [00:53<00:00,  2.20it/s][A100%|██████████| 125/125 [00:53<00:00,  2.34it/s]
+
+ 90%|████████▉ | 112/125 [00:54<00:05,  2.22it/s][A
+100%|██████████| 125/125 [01:00<00:00,  2.25it/s][A100%|██████████| 125/125 [01:00<00:00,  2.08it/s]
+
+ 13%|█▎        | 16/125 [01:31<10:22,  5.71s/it][A
+ 13%|█▎        | 16/125 [01:32<10:32,  5.80s/it][A
+ 26%|██▌       | 32/125 [01:36<05:23,  3.48s/it][A
+ 38%|███▊      | 48/125 [01:41<02:36,  2.04s/it][A
+ 26%|██▌       | 32/125 [01:45<04:24,  2.84s/it][A
+ 51%|█████     | 64/125 [01:46<01:22,  1.36s/it][A
+ 38%|███▊      | 48/125 [01:48<03:44,  2.92s/it][A
+ 38%|███▊      | 48/125 [01:51<02:12,  1.72s/it][A
+ 64%|██████▍   | 80/125 [01:52<00:44,  1.00it/s][A
+ 51%|█████     | 64/125 [01:56<01:59,  1.96s/it][A
+ 51%|█████     | 64/125 [01:58<01:14,  1.23s/it][A
+ 77%|███████▋  | 96/125 [02:00<00:24,  1.20it/s][A
+ 64%|██████▍   | 80/125 [02:03<01:03,  1.41s/it][A
+ 64%|██████▍   | 80/125 [02:05<00:42,  1.06it/s][A
+ 90%|████████▉ | 112/125 [02:08<00:09,  1.39it/s][A
+ 77%|███████▋  | 96/125 [02:10<00:31,  1.08s/it][A
+ 77%|███████▋  | 96/125 [02:12<00:22,  1.31it/s][A
+100%|██████████| 125/125 [02:16<00:00,  1.44it/s][A100%|██████████| 125/125 [02:16<00:00,  1.09s/it]
+
+ 90%|████████▉ | 112/125 [02:17<00:29,  2.25s/it][A
+ 90%|████████▉ | 112/125 [02:21<00:08,  1.45it/s][A
+ 90%|████████▉ | 112/125 [02:22<00:12,  1.03it/s][A
+100%|██████████| 125/125 [02:25<00:00,  1.80s/it][A100%|██████████| 125/125 [02:25<00:00,  1.17s/it]
+
+100%|██████████| 125/125 [02:26<00:00,  1.63it/s][A100%|██████████| 125/125 [02:26<00:00,  1.17s/it]
+
+100%|██████████| 125/125 [02:29<00:00,  1.18it/s][A100%|██████████| 125/125 [02:29<00:00,  1.20s/it]
+
+ 26%|██▌       | 32/125 [03:04<08:56,  5.77s/it][A
+ 38%|███▊      | 48/125 [03:10<04:15,  3.32s/it][A
+ 51%|█████     | 64/125 [03:19<02:15,  2.22s/it][A
+ 64%|██████▍   | 80/125 [03:26<01:11,  1.58s/it][A
+ 77%|███████▋  | 96/125 [03:34<00:34,  1.20s/it][A
+ 90%|████████▉ | 112/125 [03:39<00:12,  1.08it/s][A
+100%|██████████| 125/125 [03:48<00:00,  1.18it/s][A100%|██████████| 125/125 [03:48<00:00,  1.83s/it]
+ 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s] 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s] 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s] 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s] 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s] 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [03:48<00:02,  4.38it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.34s/it][A100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.71s/it][A100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.89s/it][A100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.78s/it][A100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.25s/it][A100%|██████████| 2/2 [00:02<00:00,  1.25s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.59s/it][A100%|██████████| 2/2 [00:03<00:00,  1.59s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.62s/it][A100%|██████████| 2/2 [00:03<00:00,  1.62s/it]
+
+100%|██████████| 2/2 [00:04<00:00,  2.15s/it][A100%|██████████| 2/2 [00:04<00:00,  2.15s/it]
+100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]
+
+100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]
+100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]
+
+100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s][rank7] {'num_prompt_tokens': 8345, 'num_generated_tokens': 8552, 'num_samples': 126, 'runtime': 232.58611488342285, 'samples/s': 0.5417348325507475, 'tokens/s': 36.76917688868248}
+[rank5] {'num_prompt_tokens': 8647, 'num_generated_tokens': 8183, 'num_samples': 126, 'runtime': 232.56373004429042, 'samples/s': 0.5417869758797041, 'tokens/s': 35.18605415574301}
+[rank2] {'num_prompt_tokens': 9655, 'num_generated_tokens': 10444, 'num_samples': 127, 'runtime': 232.57554243505, 'samples/s': 0.5460591370456184, 'tokens/s': 44.90583958507432}
+[rank1] {'num_prompt_tokens': 10162, 'num_generated_tokens': 9297, 'num_samples': 127, 'runtime': 232.58702825568616, 'samples/s': 0.546032171064962, 'tokens/s': 39.97213460150356}[rank4] {'num_prompt_tokens': 8451, 'num_generated_tokens': 8833, 'num_samples': 126, 'runtime': 232.58734798245132, 'samples/s': 0.5417319604568804, 'tokens/s': 37.97713021202877}
+
+100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]
+
+[rank3] {'num_prompt_tokens': 9965, 'num_generated_tokens': 10324, 'num_samples': 127, 'runtime': 232.58738213591278, 'samples/s': 0.5460313402804773, 'tokens/s': 44.38761855949329}[rank6] {'num_prompt_tokens': 8649, 'num_generated_tokens': 9860, 'num_samples': 126, 'runtime': 232.57578401640058, 'samples/s': 0.5417588960642388, 'tokens/s': 42.39478345391583}
+
+100%|██████████| 1012/1012 [03:52<00:00,  4.34it/s]100%|██████████| 1012/1012 [03:52<00:00,  4.35it/s]
+[rank0] {'num_prompt_tokens': 10445, 'num_generated_tokens': 10412, 'num_samples': 127, 'runtime': 232.60278106480837, 'samples/s': 0.5459951915390683, 'tokens/s': 44.76300735673054}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:55:01.283251
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' jv = zh ']'
++ src_lang=jv
++ tgt_lang=zh
++ lp=jv2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/train.log
+[2025-09-15 22:55:34,478] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:55:40,637] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:55:41,451] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[2025-09-15 22:55:41,498] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 22:55:41,573] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:55:41,742] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:55:41,780] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:55:41,867] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:55:41,874] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.jv2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.34s/it]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:55:45.851072
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:18,  5.85it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.47it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.93it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.86it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.61it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.22it/s][A
+ 26%|██▌       | 32/125 [00:04<00:12,  7.18it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.31it/s][A
+ 26%|██▌       | 32/125 [00:05<00:14,  6.33it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.89it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.02it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.77it/s][A
+ 38%|███▊      | 48/125 [00:07<00:12,  6.25it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.65it/s][A
+ 38%|███▊      | 48/125 [00:09<00:13,  5.51it/s][A
+ 26%|██▌       | 32/125 [00:09<00:28,  3.25it/s][A
+ 51%|█████     | 64/125 [00:10<00:10,  6.07it/s][A
+ 38%|███▊      | 48/125 [00:10<00:18,  4.22it/s][A
+ 38%|███▊      | 48/125 [00:10<00:17,  4.50it/s][A
+ 26%|██▌       | 32/125 [00:10<00:32,  2.85it/s][A
+ 51%|█████     | 64/125 [00:11<00:10,  5.77it/s][A
+ 38%|███▊      | 48/125 [00:12<00:18,  4.08it/s][A
+ 64%|██████▍   | 80/125 [00:13<00:07,  5.96it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.69it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.52it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.67it/s][A
+ 38%|███▊      | 48/125 [00:13<00:22,  3.47it/s][A
+ 64%|██████▍   | 80/125 [00:14<00:07,  5.95it/s][A
+ 77%|███████▋  | 96/125 [00:15<00:04,  5.86it/s][A
+ 51%|█████     | 64/125 [00:16<00:14,  4.33it/s][A
+ 51%|█████     | 64/125 [00:17<00:15,  3.86it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.26it/s][A
+ 64%|██████▍   | 80/125 [00:17<00:10,  4.27it/s][A
+ 77%|███████▋  | 96/125 [00:18<00:05,  5.16it/s][A
+ 90%|████████▉ | 112/125 [00:18<00:02,  5.94it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.48it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:05,  4.89it/s][A
+ 90%|████████▉ | 112/125 [00:20<00:02,  5.37it/s][A
+ 77%|███████▋  | 96/125 [00:20<00:06,  4.42it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:11,  3.89it/s][A
+100%|██████████| 125/125 [00:21<00:00,  5.29it/s][A100%|██████████| 125/125 [00:21<00:00,  5.75it/s]
+
+ 90%|████████▉ | 112/125 [00:22<00:02,  5.46it/s][A
+100%|██████████| 125/125 [00:23<00:00,  5.42it/s][A100%|██████████| 125/125 [00:23<00:00,  5.41it/s]
+
+ 90%|████████▉ | 112/125 [00:24<00:02,  4.61it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.11it/s][A
+100%|██████████| 125/125 [00:24<00:00,  5.70it/s][A100%|██████████| 125/125 [00:24<00:00,  5.14it/s]
+
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.07it/s][A
+ 90%|████████▉ | 112/125 [00:26<00:02,  4.89it/s][A
+100%|██████████| 125/125 [00:28<00:00,  3.96it/s][A100%|██████████| 125/125 [00:28<00:00,  4.37it/s]
+
+100%|██████████| 125/125 [00:29<00:00,  4.70it/s][A100%|██████████| 125/125 [00:29<00:00,  4.28it/s]
+
+ 90%|████████▉ | 112/125 [00:29<00:03,  3.68it/s][A
+100%|██████████| 125/125 [00:33<00:00,  3.75it/s][A100%|██████████| 125/125 [00:33<00:00,  3.77it/s]
+
+ 13%|█▎        | 16/125 [01:32<10:27,  5.75s/it][A
+ 26%|██▌       | 32/125 [01:35<03:50,  2.48s/it][A
+ 38%|███▊      | 48/125 [01:37<01:49,  1.42s/it][A
+ 51%|█████     | 64/125 [01:40<00:57,  1.07it/s][A
+ 64%|██████▍   | 80/125 [01:43<00:30,  1.50it/s][A
+ 77%|███████▋  | 96/125 [01:46<00:14,  2.00it/s][A
+ 90%|████████▉ | 112/125 [01:48<00:04,  2.62it/s][A
+ 64%|██████▍   | 80/125 [01:49<01:42,  2.29s/it][A
+100%|██████████| 125/125 [01:50<00:00,  3.12it/s][A100%|██████████| 125/125 [01:50<00:00,  1.13it/s]
+
+ 77%|███████▋  | 96/125 [01:52<00:45,  1.57s/it][A
+ 90%|████████▉ | 112/125 [01:57<00:15,  1.17s/it][A
+100%|██████████| 125/125 [02:00<00:00,  1.10it/s][A100%|██████████| 125/125 [02:00<00:00,  1.04it/s]
+ 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s] 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s] 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s] 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s] 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s] 99%|█████████▉| 1000/1012 [02:00<00:01,  8.30it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:00<00:01,  8.28it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:00<00:01,  8.28it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.15it/s][A100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.80it/s][A100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.64it/s][A100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.37it/s][A100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
+
+100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A100%|██████████| 2/2 [00:00<00:00,  2.03it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A100%|██████████| 2/2 [00:01<00:00,  1.80it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.29it/s][A100%|██████████| 2/2 [00:01<00:00,  1.29it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.31it/s][A100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
+100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]
+100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]
+
+100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]
+100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]
+
+[rank7] {'num_prompt_tokens': 12150, 'num_generated_tokens': 4113, 'num_samples': 126, 'runtime': 122.28037379309535, 'samples/s': 1.0304188324873658, 'tokens/s': 33.635814746194725}[rank1] {'num_prompt_tokens': 12953, 'num_generated_tokens': 3801, 'num_samples': 127, 'runtime': 122.26114082336426, 'samples/s': 1.0387601419774266, 'tokens/s': 31.08919133587558}
+
+[rank3] {'num_prompt_tokens': 13300, 'num_generated_tokens': 4449, 'num_samples': 127, 'runtime': 122.28354782238603, 'samples/s': 1.0385698015931342, 'tokens/s': 36.3826539156524}
+[rank6] {'num_prompt_tokens': 11871, 'num_generated_tokens': 3380, 'num_samples': 126, 'runtime': 122.28385027498007, 'samples/s': 1.0303895380842476, 'tokens/s': 27.640608243847275}
+[rank2] {'num_prompt_tokens': 13763, 'num_generated_tokens': 3544, 'num_samples': 127, 'runtime': 122.28243673592806, 'samples/s': 1.0385792382781809, 'tokens/s': 28.982085200455693}
+[rank4] {'num_prompt_tokens': 13171, 'num_generated_tokens': 3258, 'num_samples': 126, 'runtime': 122.2840594239533, 'samples/s': 1.0303877757538593, 'tokens/s': 26.64288391592122}
+100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]100%|██████████| 1012/1012 [02:02<00:00,  8.28it/s]
+[rank5] {'num_prompt_tokens': 12413, 'num_generated_tokens': 3319, 'num_samples': 126, 'runtime': 122.28427246026695, 'samples/s': 1.0303859806741735, 'tokens/s': 27.14167515759986}
+100%|██████████| 1012/1012 [02:02<00:00,  8.27it/s]
+[rank0] {'num_prompt_tokens': 14139, 'num_generated_tokens': 3907, 'num_samples': 127, 'runtime': 122.29887679219246, 'samples/s': 1.0384396270114205, 'tokens/s': 31.946327738059995}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 22:57:51.717942
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=jv
++ lp=zh2jv
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/train.log
+[2025-09-15 22:58:24,743] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 22:58:31,611] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:58:31,626] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:58:31,942] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:58:31,991] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:58:32,091] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:58:32,129] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 22:58:32,163] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 22:58:32,212] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2jv.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.66s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.00s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.24s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 22:58:36.251033
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:04<00:32,  3.32it/s][A
+ 13%|█▎        | 16/125 [00:05<00:35,  3.04it/s][A
+ 13%|█▎        | 16/125 [00:05<00:36,  2.99it/s][A
+ 26%|██▌       | 32/125 [00:10<00:29,  3.21it/s][A
+ 13%|█▎        | 16/125 [01:32<10:29,  5.77s/it][A
+ 13%|█▎        | 16/125 [01:32<10:30,  5.79s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.84s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:35<10:53,  5.99s/it][A
+ 26%|██▌       | 32/125 [01:37<03:58,  2.56s/it][A
+ 26%|██▌       | 32/125 [01:37<03:58,  2.56s/it][A
+ 26%|██▌       | 32/125 [01:39<05:35,  3.60s/it][A
+ 26%|██▌       | 32/125 [01:40<05:38,  3.64s/it][A
+ 38%|███▊      | 48/125 [01:41<03:34,  2.79s/it][A
+ 38%|███▊      | 48/125 [01:42<01:57,  1.53s/it][A
+ 26%|██▌       | 32/125 [01:42<04:13,  2.72s/it][A
+ 38%|███▊      | 48/125 [01:46<02:02,  1.59s/it][A
+ 51%|█████     | 64/125 [01:51<01:05,  1.08s/it][A
+ 26%|██▌       | 32/125 [03:01<08:47,  5.67s/it][A
+ 26%|██▌       | 32/125 [03:03<08:54,  5.74s/it][A
+ 38%|███▊      | 48/125 [03:06<04:06,  3.21s/it][A
+ 38%|███▊      | 48/125 [03:10<05:11,  4.04s/it][A
+ 51%|█████     | 64/125 [03:10<02:05,  2.05s/it][A
+ 51%|█████     | 64/125 [03:12<03:58,  3.92s/it][A
+ 38%|███▊      | 48/125 [03:12<05:55,  4.62s/it][A
+ 38%|███▊      | 48/125 [03:13<05:55,  4.62s/it][A
+ 51%|█████     | 64/125 [03:15<03:16,  3.22s/it][A
+ 51%|█████     | 64/125 [03:15<02:36,  2.57s/it][A
+ 51%|█████     | 64/125 [03:17<02:57,  2.90s/it][A
+ 64%|██████▍   | 80/125 [03:19<01:37,  2.16s/it][A
+ 64%|██████▍   | 80/125 [03:20<01:19,  1.76s/it][A
+ 64%|██████▍   | 80/125 [03:22<01:28,  1.97s/it][A
+ 64%|██████▍   | 80/125 [03:24<02:05,  2.79s/it][A
+ 77%|███████▋  | 96/125 [03:24<00:44,  1.53s/it][A
+ 77%|███████▋  | 96/125 [03:29<00:42,  1.45s/it][A
+ 90%|████████▉ | 112/125 [03:33<00:13,  1.05s/it][A
+ 38%|███▊      | 48/125 [04:37<07:24,  5.78s/it][A
+ 51%|█████     | 64/125 [04:41<03:40,  3.61s/it][A
+ 64%|██████▍   | 80/125 [04:42<02:31,  3.37s/it][A
+ 64%|██████▍   | 80/125 [04:43<03:25,  4.56s/it][A
+ 51%|█████     | 64/125 [04:45<05:10,  5.09s/it][A
+ 77%|███████▋  | 96/125 [04:46<01:07,  2.32s/it][A
+ 77%|███████▋  | 96/125 [04:52<01:30,  3.11s/it][A
+100%|██████████| 125/125 [04:53<00:00,  2.45s/it][A100%|██████████| 125/125 [04:53<00:00,  2.34s/it]
+
+ 90%|████████▉ | 112/125 [04:54<00:37,  2.86s/it][A
+ 77%|███████▋  | 96/125 [04:57<01:50,  3.82s/it][A
+100%|██████████| 125/125 [04:58<00:00,  2.15s/it][A100%|██████████| 125/125 [04:58<00:00,  2.38s/it]
+
+ 77%|███████▋  | 96/125 [06:14<02:23,  4.95s/it][A
+ 64%|██████▍   | 80/125 [06:15<03:19,  4.43s/it][A
+ 90%|████████▉ | 112/125 [06:17<00:44,  3.42s/it][A
+ 64%|██████▍   | 80/125 [06:19<04:01,  5.36s/it][A
+ 77%|███████▋  | 96/125 [06:20<01:27,  3.02s/it][A
+100%|██████████| 125/125 [06:21<00:00,  2.56s/it][A100%|██████████| 125/125 [06:21<00:00,  3.05s/it]
+
+ 90%|████████▉ | 112/125 [06:26<00:52,  4.03s/it][A
+ 90%|████████▉ | 112/125 [06:31<00:58,  4.49s/it][A
+ 90%|████████▉ | 112/125 [07:45<01:07,  5.18s/it][A
+100%|██████████| 125/125 [07:46<00:00,  4.60s/it][A100%|██████████| 125/125 [07:46<00:00,  3.73s/it]
+
+100%|██████████| 125/125 [07:53<00:00,  4.98s/it][A100%|██████████| 125/125 [07:53<00:00,  3.79s/it]
+
+ 90%|████████▉ | 112/125 [07:53<00:51,  3.93s/it][A
+ 77%|███████▋  | 96/125 [07:53<02:40,  5.54s/it][A
+100%|██████████| 125/125 [09:04<00:00,  5.44s/it][A100%|██████████| 125/125 [09:04<00:00,  4.36s/it]
+
+100%|██████████| 125/125 [09:13<00:00,  4.56s/it][A100%|██████████| 125/125 [09:13<00:00,  4.43s/it]
+
+ 90%|████████▉ | 112/125 [09:26<01:13,  5.63s/it][A
+100%|██████████| 125/125 [09:30<00:00,  4.16s/it][A100%|██████████| 125/125 [09:30<00:00,  4.56s/it]
+ 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s] 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s]
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s]
+[A  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [09:30<00:06,  1.75it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.07it/s][A100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
+
+100%|██████████| 1/1 [00:01<00:00,  1.10s/it][A100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.14s/it][A100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.30s/it][A100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
+
+100%|██████████| 2/2 [00:01<00:00,  1.06it/s][A100%|██████████| 2/2 [00:01<00:00,  1.06it/s]
+
+100%|██████████| 2/2 [00:02<00:00,  1.09s/it][A100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.14s/it][A100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.60s/it][A100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
+100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+
+100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+[rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 14878, 'num_samples': 126, 'runtime': 573.6752871610224, 'samples/s': 0.21963644385579678, 'tokens/s': 25.9345318387821}
+100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s][rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 15174, 'num_samples': 126, 'runtime': 573.6967382691801, 'samples/s': 0.21962823142438795, 'tokens/s': 26.44951415582272}
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 11787, 'num_samples': 126, 'runtime': 573.697110209614, 'samples/s': 0.21962808903458284, 'tokens/s': 20.545684805163713}
+
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 13125, 'num_samples': 127, 'runtime': 573.6956671364605, 'samples/s': 0.22137172594296672, 'tokens/s': 22.8779834882003}
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 14551, 'num_samples': 127, 'runtime': 573.6965487580746, 'samples/s': 0.2213713857524971, 'tokens/s': 25.363582945547915}
+100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s][rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 10004, 'num_samples': 126, 'runtime': 573.6966658551246, 'samples/s': 0.21962825914665282, 'tokens/s': 17.437786543675514}
+100%|██████████| 1012/1012 [09:33<00:00,  1.77it/s]100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+100%|██████████| 1012/1012 [09:33<00:00,  1.76it/s]
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 9822, 'num_samples': 127, 'runtime': 573.6964201983064, 'samples/s': 0.22137143535966394, 'tokens/s': 17.120553055926138}
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 14241, 'num_samples': 127, 'runtime': 573.7118388321251, 'samples/s': 0.2213654859528909, 'tokens/s': 24.822566027205667}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:08:13.816487
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' sw = zh ']'
++ src_lang=sw
++ tgt_lang=zh
++ lp=sw2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl
+[2025-09-15 23:08:47,003] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:08:54,050] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:08:54,061] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:08:54,165] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:08:54,172] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:08:54,506] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:08:54,524] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:08:54,539] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:08:54,540] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.sw2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.39s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.42s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.40s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.40s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:08:58.386316
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.05it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]
+
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.39s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/125 [00:00<?, ?it/s][A[A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.17it/s][A
+ 13%|█▎        | 16/125 [00:02<00:17,  6.11it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.55it/s][A
+ 13%|█▎        | 16/125 [00:03<00:21,  5.13it/s][A
+ 13%|█▎        | 16/125 [00:03<00:23,  4.66it/s][A
+ 13%|█▎        | 16/125 [00:04<00:33,  3.22it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  6.15it/s][A
+ 26%|██▌       | 32/125 [00:05<00:15,  5.96it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.79it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.65it/s][A
+ 38%|███▊      | 48/125 [00:08<00:13,  5.87it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.34it/s][A
+ 51%|█████     | 64/125 [00:11<00:11,  5.53it/s][A
+ 38%|███▊      | 48/125 [00:12<00:18,  4.18it/s][A
+ 51%|█████     | 64/125 [00:16<00:15,  3.85it/s][A
+ 64%|██████▍   | 80/125 [00:21<00:11,  3.80it/s][A
+ 77%|███████▋  | 96/125 [00:24<00:07,  4.12it/s][A
+ 13%|█▎        | 16/125 [01:33<10:38,  5.86s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.86s/it][A
+ 26%|██▌       | 32/125 [01:35<03:51,  2.49s/it][A
+ 26%|██▌       | 32/125 [01:37<03:55,  2.54s/it][A
+ 38%|███▊      | 48/125 [01:39<03:33,  2.77s/it][A
+ 38%|███▊      | 48/125 [01:39<03:34,  2.78s/it][A
+ 26%|██▌       | 32/125 [01:40<05:38,  3.64s/it][A
+ 38%|███▊      | 48/125 [01:40<01:52,  1.47s/it][A
+ 38%|███▊      | 48/125 [01:41<03:38,  2.84s/it][A
+ 51%|█████     | 64/125 [01:41<01:46,  1.74s/it][A
+ 51%|█████     | 64/125 [01:42<00:57,  1.06it/s][A
+ 51%|█████     | 64/125 [01:43<01:47,  1.77s/it][A
+ 38%|███▊      | 48/125 [01:43<02:40,  2.09s/it][A
+ 51%|█████     | 64/125 [01:44<01:49,  1.79s/it][A
+ 64%|██████▍   | 80/125 [01:45<00:29,  1.51it/s][A
+ 64%|██████▍   | 80/125 [01:45<00:53,  1.20s/it][A
+ 51%|█████     | 64/125 [01:46<01:20,  1.32s/it][A
+ 64%|���█████▍   | 80/125 [01:47<01:42,  2.27s/it][A
+ 77%|███████▋  | 96/125 [01:48<00:14,  2.00it/s][A
+ 77%|███████▋  | 96/125 [01:48<00:24,  1.18it/s][A
+ 77%|███████▋  | 96/125 [01:49<00:45,  1.56s/it][A
+ 64%|██████▍   | 80/125 [01:49<00:41,  1.08it/s][A
+ 90%|████████▉ | 112/125 [01:51<00:05,  2.49it/s][A
+ 90%|████████▉ | 112/125 [01:51<00:08,  1.58it/s][A
+100%|██████████| 125/125 [01:53<00:00,  2.91it/s][A100%|██████████| 125/125 [01:53<00:00,  1.10it/s]
+
+ 77%|███████▋  | 96/125 [01:54<00:20,  1.42it/s][A
+100%|██████████| 125/125 [01:55<00:00,  1.84it/s][A100%|██████████| 125/125 [01:55<00:00,  1.08it/s]
+
+ 90%|████████▉ | 112/125 [01:56<00:06,  1.90it/s][A
+100%|██████████| 125/125 [01:59<00:00,  2.27it/s][A100%|██████████| 125/125 [01:59<00:00,  1.05it/s]
+
+ 90%|████████▉ | 112/125 [01:59<00:27,  2.11s/it][A
+100%|██████████| 125/125 [02:02<00:00,  1.60s/it][A100%|██████████| 125/125 [02:02<00:00,  1.02it/s]
+
+ 38%|███▊      | 48/125 [03:07<05:06,  3.98s/it][A
+ 51%|█████     | 64/125 [03:12<02:33,  2.51s/it][A
+ 64%|██████▍   | 80/125 [03:15<02:24,  3.21s/it][A
+ 64%|██████▍   | 80/125 [03:16<01:16,  1.70s/it][A
+ 77%|███████▋  | 96/125 [03:18<00:33,  1.17s/it][A
+ 77%|███████▋  | 96/125 [03:19<01:03,  2.20s/it][A
+ 64%|██████▍   | 80/125 [03:19<02:27,  3.29s/it][A
+ 90%|████████▉ | 112/125 [03:20<00:10,  1.21it/s][A
+ 90%|████████▉ | 112/125 [03:22<00:38,  2.93s/it][A
+ 77%|███████▋  | 96/125 [03:22<01:04,  2.23s/it][A
+100%|██████████| 125/125 [03:22<00:00,  1.55it/s][A100%|██████████| 125/125 [03:22<00:00,  1.62s/it]
+
+100%|██████████| 125/125 [03:24<00:00,  2.17s/it][A100%|██████████| 125/125 [03:24<00:00,  1.63s/it]
+
+ 90%|████████▉ | 112/125 [03:24<00:20,  1.59s/it][A
+ 90%|████████▉ | 112/125 [03:25<00:20,  1.57s/it][A
+100%|██████████| 125/125 [03:27<00:00,  1.22s/it][A100%|██████████| 125/125 [03:27<00:00,  1.66s/it]
+
+100%|██████████| 125/125 [03:29<00:00,  1.21s/it][A100%|██████████| 125/125 [03:29<00:00,  1.67s/it]
+ 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s] 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s]
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [03:29<00:02,  4.78it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.90it/s][A100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.85it/s][A100%|██████████| 1/1 [00:00<00:00,  1.85it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.53it/s][A100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.54it/s][A100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A100%|██████████| 2/2 [00:01<00:00,  1.93it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.25it/s][A100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.25it/s][A100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.24it/s][A100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
+100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]
+100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]
+100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]
+
+[rank1] {'num_prompt_tokens': 14527, 'num_generated_tokens': 5751, 'num_samples': 127, 'runtime': 210.92175040207803, 'samples/s': 0.6021190311473386, 'tokens/s': 27.266035812034207}
+100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]
+
+
+[rank6] {'num_prompt_tokens': 13761, 'num_generated_tokens': 4429, 'num_samples': 126, 'runtime': 210.92097748443484, 'samples/s': 0.5973801254988889, 'tokens/s': 20.99838552249666}
+[rank3] {'num_prompt_tokens': 14180, 'num_generated_tokens': 5535, 'num_samples': 127, 'runtime': 210.9082309603691, 'samples/s': 0.6021576276170276, 'tokens/s': 26.243641487088567}
+[rank2] {'num_prompt_tokens': 15360, 'num_generated_tokens': 4689, 'num_samples': 127, 'runtime': 210.92038844898343, 'samples/s': 0.6021229191445295, 'tokens/s': 22.231136754871642}
+[rank7] {'num_prompt_tokens': 12607, 'num_generated_tokens': 5111, 'num_samples': 126, 'runtime': 210.92071105912328, 'samples/s': 0.5973808800819038, 'tokens/s': 24.231854588084207}[rank4] {'num_prompt_tokens': 14537, 'num_generated_tokens': 5343, 'num_samples': 126, 'runtime': 210.92229717224836, 'samples/s': 0.5973763878415514, 'tokens/s': 25.331603493947693}
+
+[rank5] {'num_prompt_tokens': 13718, 'num_generated_tokens': 5289, 'num_samples': 126, 'runtime': 210.90830744989216, 'samples/s': 0.5974160123111093, 'tokens/s': 25.07724832629728}
+100%|██████████| 1012/1012 [03:30<00:00,  4.81it/s]100%|██████████| 1012/1012 [03:30<00:00,  4.80it/s]
+[rank0] {'num_prompt_tokens': 15593, 'num_generated_tokens': 4972, 'num_samples': 127, 'runtime': 210.9368197787553, 'samples/s': 0.6020760156202513, 'tokens/s': 23.57103897373141}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:12:33.179311
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=sw
++ lp=zh2sw
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/train.log
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl
+[2025-09-15 23:13:06,484] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:13:13,455] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:13:13,570] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:13:13,699] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:13:13,744] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:13:13,830] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 23:13:13,921] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:13:13,927] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:13:13,934] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2sw.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|��████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.03s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.13s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.21s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:13:17.900610
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [01:32<10:27,  5.75s/it][A
+ 13%|█▎        | 16/125 [01:32<10:32,  5.80s/it][A
+ 13%|█▎        | 16/125 [01:33<10:36,  5.84s/it][A
+ 13%|█▎        | 16/125 [01:33<10:36,  5.84s/it][A
+ 13%|█▎        | 16/125 [01:33<10:39,  5.86s/it][A
+ 13%|█▎        | 16/125 [01:34<10:43,  5.90s/it][A
+ 13%|█▎        | 16/125 [01:35<10:47,  5.94s/it][A
+ 13%|█▎        | 16/125 [01:35<10:53,  5.99s/it][A
+ 26%|██▌       | 32/125 [01:36<03:57,  2.55s/it][A
+ 26%|██▌       | 32/125 [03:02<08:48,  5.69s/it][A
+ 26%|██▌       | 32/125 [03:04<08:55,  5.75s/it][A
+ 26%|██▌       | 32/125 [03:06<09:01,  5.82s/it][A
+ 26%|██▌       | 32/125 [03:06<09:02,  5.84s/it][A
+ 38%|███▊      | 48/125 [03:07<05:05,  3.97s/it][A
+ 26%|██▌       | 32/125 [03:09<09:08,  5.90s/it][A
+ 26%|██▌       | 32/125 [03:10<09:13,  5.95s/it][A
+ 26%|██▌       | 32/125 [03:11<09:15,  5.97s/it][A
+ 38%|███▊      | 48/125 [04:32<07:15,  5.66s/it][A
+ 38%|███▊      | 48/125 [04:35<07:19,  5.71s/it][A
+ 51%|█████     | 64/125 [04:38<04:43,  4.65s/it][A
+ 38%|███▊      | 48/125 [04:39<07:27,  5.82s/it][A
+ 38%|███▊      | 48/125 [04:41<07:32,  5.88s/it][A
+ 38%|███▊      | 48/125 [04:41<07:31,  5.86s/it][A
+ 38%|███▊      | 48/125 [04:43<07:32,  5.87s/it][A
+ 38%|███▊      | 48/125 [04:43<07:34,  5.91s/it][A
+ 51%|█████     | 64/125 [06:04<05:46,  5.68s/it][A
+ 51%|█████     | 64/125 [06:07<05:50,  5.74s/it][A
+ 64%|██████▍   | 80/125 [06:09<03:46,  5.03s/it][A
+ 51%|█████     | 64/125 [06:11<05:54,  5.81s/it][A
+ 51%|█████     | 64/125 [06:14<05:56,  5.85s/it][A
+ 51%|█████     | 64/125 [06:14<05:55,  5.83s/it][A
+ 51%|█████     | 64/125 [06:16<05:57,  5.86s/it][A
+ 51%|█████     | 64/125 [06:18<06:00,  5.90s/it][A
+ 64%|██████▍   | 80/125 [07:35<04:16,  5.70s/it][A
+ 64%|██████▍   | 80/125 [07:39<04:18,  5.74s/it][A
+ 77%|███████▋  | 96/125 [07:40<02:32,  5.26s/it][A
+ 64%|██████▍   | 80/125 [07:44<04:21,  5.80s/it][A
+ 64%|██████▍   | 80/125 [07:48<04:22,  5.84s/it][A
+ 64%|██████▍   | 80/125 [07:49<04:24,  5.88s/it][A
+ 64%|██████▍   | 80/125 [07:50<04:23,  5.86s/it][A
+ 64%|██████▍   | 80/125 [07:52<04:25,  5.90s/it][A
+ 77%|███████▋  | 96/125 [09:08<02:46,  5.73s/it][A
+ 77%|███████▋  | 96/125 [09:09<02:45,  5.70s/it][A
+ 90%|████████▉ | 112/125 [09:11<01:09,  5.38s/it][A
+ 77%|███████▋  | 96/125 [09:16<02:47,  5.79s/it][A
+ 77%|███████▋  | 96/125 [09:21<02:49,  5.84s/it][A
+ 77%|███████▋  | 96/125 [09:22<02:49,  5.85s/it][A
+ 77%|███████▋  | 96/125 [09:23<02:49,  5.86s/it][A
+ 77%|███████▋  | 96/125 [09:25<02:50,  5.87s/it][A
+100%|██████████| 125/125 [10:29<00:00,  5.57s/it][A100%|██████████| 125/125 [10:29<00:00,  5.04s/it]
+
+ 90%|████████▉ | 112/125 [10:39<01:14,  5.71s/it][A
+ 90%|████████▉ | 112/125 [10:39<01:13,  5.68s/it][A
+ 90%|████████▉ | 112/125 [10:51<01:15,  5.83s/it][A
+ 90%|████████▉ | 112/125 [10:55<01:15,  5.84s/it][A
+ 90%|████████▉ | 112/125 [10:56<01:15,  5.83s/it][A
+ 90%|████████▉ | 112/125 [10:56<01:16,  5.87s/it][A
+ 90%|████████▉ | 112/125 [11:00<01:16,  5.88s/it][A
+100%|██████████| 125/125 [11:58<00:00,  5.78s/it][A100%|██████████| 125/125 [11:58<00:00,  5.75s/it]
+
+100%|██████████| 125/125 [11:58<00:00,  5.83s/it][A100%|██████████| 125/125 [11:58<00:00,  5.75s/it]
+
+100%|██████████| 125/125 [12:11<00:00,  5.92s/it][A100%|██████████| 125/125 [12:11<00:00,  5.85s/it]
+
+100%|██████████| 125/125 [12:16<00:00,  5.96s/it][A100%|██████████| 125/125 [12:16<00:00,  5.89s/it]
+
+100%|██████████| 125/125 [12:16<00:00,  5.93s/it][A100%|██████████| 125/125 [12:16<00:00,  5.90s/it]
+
+100%|██████████| 125/125 [12:18<00:00,  5.99s/it][A100%|██████████| 125/125 [12:18<00:00,  5.91s/it]
+
+100%|██████████| 125/125 [12:22<00:00,  6.00s/it][A100%|██████████| 125/125 [12:22<00:00,  5.94s/it]
+ 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s]
+
+  0%|          | 0/2 [00:00<?, ?it/s]  0%|          | 0/2 [00:00<?, ?it/s][A[A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s][A[A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [12:22<00:08,  1.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:01<00:00,  1.10s/it][A100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.25s/it][A100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.53s/it][A100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
+
+100%|██████████| 1/1 [00:01<00:00,  1.57s/it][A100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
+
+100%|██████████| 2/2 [00:02<00:00,  1.40s/it][A100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
+
+100%|██████████| 2/2 [00:03<00:00,  1.53s/it][A100%|██████████| 2/2 [00:03<00:00,  1.53s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.23s/it][A100%|██████████| 2/2 [00:44<00:00, 22.23s/it]
+
+100%|██████████| 2/2 [00:45<00:00, 22.99s/it][A100%|██████████| 2/2 [00:45<00:00, 22.99s/it]
+100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]
+100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]
+100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 32291, 'num_samples': 127, 'runtime': 788.3729918673635, 'samples/s': 0.16109126176327282, 'tokens/s': 40.959038847227106}
+100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s][rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 28185, 'num_samples': 127, 'runtime': 788.3745605628937, 'samples/s': 0.16109094122636697, 'tokens/s': 35.75077305878073}
+100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]
+
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 28727, 'num_samples': 126, 'runtime': 788.3682138193399, 'samples/s': 0.15982379526640048, 'tokens/s': 36.43855687791973}
+100%|██████████| 1012/1012 [13:08<00:00,  1.26it/s][rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 32273, 'num_samples': 126, 'runtime': 788.3519282322377, 'samples/s': 0.15982709686844596, 'tokens/s': 40.937300771709175}
+100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]
+
+100%|██████████| 1012/1012 [13:08<00:00,  1.28it/s]
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 29494, 'num_samples': 127, 'runtime': 788.3745529074222, 'samples/s': 0.16109094279063246, 'tokens/s': 37.411151706038694}
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 35534, 'num_samples': 127, 'runtime': 788.3893212210387, 'samples/s': 0.161087925193235, 'tokens/s': 45.071640423751276}[rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 31227, 'num_samples': 126, 'runtime': 788.374878777191, 'samples/s': 0.15982244410861027, 'tokens/s': 39.60932906491724}
+
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 28335, 'num_samples': 126, 'runtime': 788.3720416519791, 'samples/s': 0.15982301926381828, 'tokens/s': 35.94115278444675}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:26:29.938528
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' si = zh ']'
++ src_lang=si
++ tgt_lang=zh
++ lp=si2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/train.log
+[2025-09-15 23:27:02,988] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:27:09,983] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:27:10,071] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:27:10,235] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:27:10,318] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:27:10,404] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:27:10,417] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:27:10,463] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:27:10,470] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.si2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.44s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.43s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.46s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:27:14.452389
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.33it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.25it/s][A
+ 13%|█▎        | 16/125 [00:04<00:27,  3.99it/s][A
+ 13%|█▎        | 16/125 [00:04<00:29,  3.70it/s][A
+ 13%|█▎        | 16/125 [00:05<00:34,  3.20it/s][A
+ 26%|██▌       | 32/125 [00:05<00:16,  5.78it/s][A
+ 13%|█▎        | 16/125 [00:06<00:46,  2.36it/s][A
+ 26%|██▌       | 32/125 [00:07<00:21,  4.35it/s][A
+ 26%|██▌       | 32/125 [00:08<00:23,  3.91it/s][A
+ 26%|██▌       | 32/125 [00:08<00:25,  3.65it/s][A
+ 38%|███▊      | 48/125 [00:09<00:15,  4.84it/s][A
+ 38%|███▊      | 48/125 [00:11<00:19,  3.99it/s][A
+ 38%|███▊      | 48/125 [00:11<00:18,  4.10it/s][A
+ 26%|██▌       | 32/125 [00:12<00:36,  2.54it/s][A
+ 51%|█████     | 64/125 [00:13<00:13,  4.58it/s][A
+ 26%|██▌       | 32/125 [00:14<00:42,  2.19it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.28it/s][A
+ 38%|███▊      | 48/125 [00:15<00:24,  3.11it/s][A
+ 38%|███▊      | 48/125 [00:18<00:26,  2.86it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:10,  4.41it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:11,  3.82it/s][A
+ 51%|█████     | 64/125 [00:19<00:17,  3.56it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  4.11it/s][A
+ 64%|██████▍   | 80/125 [00:24<00:12,  3.53it/s][A
+ 51%|█████     | 64/125 [00:24<00:23,  2.61it/s][A
+ 90%|████████▉ | 112/125 [00:27<00:03,  4.15it/s][A
+100%|██████████| 125/125 [00:30<00:00,  3.99it/s][A100%|██████████| 125/125 [00:30<00:00,  4.08it/s]
+
+ 64%|██████▍   | 80/125 [00:31<00:17,  2.51it/s][A
+ 77%|███████▋  | 96/125 [00:33<00:11,  2.62it/s][A
+ 77%|███████▋  | 96/125 [00:35<00:09,  2.95it/s][A
+ 90%|████████▉ | 112/125 [00:36<00:04,  3.14it/s][A
+100%|██████████| 125/125 [00:40<00:00,  3.06it/s][A100%|██████████| 125/125 [00:40<00:00,  3.07it/s]
+
+ 90%|████████▉ | 112/125 [00:42<00:04,  2.63it/s][A
+100%|██████████| 125/125 [00:46<00:00,  2.75it/s][A100%|██████████| 125/125 [00:46<00:00,  2.66it/s]
+
+ 13%|█▎        | 16/125 [01:50<12:30,  6.88s/it][A
+ 26%|██▌       | 32/125 [01:54<04:37,  2.99s/it][A
+ 13%|█▎        | 16/125 [02:00<13:40,  7.53s/it][A
+ 51%|█████     | 64/125 [02:02<02:55,  2.87s/it][A
+ 26%|██▌       | 32/125 [02:04<05:03,  3.26s/it][A
+ 77%|███████▋  | 96/125 [02:06<01:11,  2.46s/it][A
+ 64%|██████▍   | 80/125 [02:07<01:27,  1.96s/it][A
+ 38%|███▊      | 48/125 [02:08<02:24,  1.87s/it][A
+ 90%|████████▉ | 112/125 [02:10<00:22,  1.74s/it][A
+ 38%|███▊      | 48/125 [02:12<04:43,  3.69s/it][A
+ 77%|███████▋  | 96/125 [02:12<00:40,  1.41s/it][A
+100%|██████████| 125/125 [02:15<00:00,  1.36s/it][A100%|██████████| 125/125 [02:15<00:00,  1.08s/it]
+
+ 90%|████████▉ | 112/125 [02:17<00:13,  1.03s/it][A
+ 51%|█████     | 64/125 [02:17<01:22,  1.35s/it][A
+ 64%|██████▍   | 80/125 [02:21<00:43,  1.03it/s][A
+ 77%|███████▋  | 96/125 [02:24<00:20,  1.42it/s][A
+ 90%|████████▉ | 112/125 [02:28<00:07,  1.84it/s][A
+100%|██████████| 125/125 [02:31<00:00,  2.21it/s][A100%|██████████| 125/125 [02:31<00:00,  1.21s/it]
+
+100%|██████████| 125/125 [03:48<00:00,  2.68s/it][A100%|██████████| 125/125 [03:48<00:00,  1.83s/it]
+
+ 38%|███▊      | 48/125 [03:50<06:20,  4.94s/it][A
+ 51%|█████     | 64/125 [03:55<03:09,  3.10s/it][A
+ 51%|█████     | 64/125 [03:59<04:56,  4.86s/it][A
+ 64%|██████▍   | 80/125 [03:59<01:33,  2.08s/it][A
+ 64%|██████▍   | 80/125 [04:08<02:29,  3.32s/it][A
+ 77%|███████▋  | 96/125 [05:50<01:48,  3.74s/it][A
+ 77%|███████▋  | 96/125 [05:55<02:09,  4.47s/it][A
+ 90%|████████▉ | 112/125 [06:00<00:35,  2.71s/it][A
+ 90%|████████▉ | 112/125 [06:00<00:40,  3.12s/it][A
+100%|██████████| 125/125 [06:03<00:00,  2.04s/it][A100%|██████████| 125/125 [06:03<00:00,  2.91s/it]
+
+100%|██████████| 125/125 [06:07<00:00,  2.40s/it][A100%|██████████| 125/125 [06:07<00:00,  2.94s/it]
+ 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s] 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s] 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s] 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s] 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s] 99%|█████████▉| 1000/1012 [06:07<00:04,  2.72it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [06:08<00:04,  2.72it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [06:08<00:04,  2.72it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  1.62it/s][A100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.60it/s][A100%|██████████| 1/1 [00:00<00:00,  1.60it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.43it/s][A100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.35it/s][A100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A100%|██████████| 2/2 [00:01<00:00,  1.98it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.66it/s][A100%|██████████| 2/2 [00:01<00:00,  1.66it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.38it/s][A100%|██████████| 2/2 [00:01<00:00,  1.38it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.32it/s][A100%|██████████| 2/2 [00:01<00:00,  1.32it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.75it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+[rank1] {'num_prompt_tokens': 47254, 'num_generated_tokens': 7071, 'num_samples': 127, 'runtime': 369.777321504429, 'samples/s': 0.34344994301787884, 'tokens/s': 19.1223192683419}
+[rank2] {'num_prompt_tokens': 44528, 'num_generated_tokens': 3773, 'num_samples': 127, 'runtime': 369.77713259868324, 'samples/s': 0.34345011847401685, 'tokens/s': 10.203443283483981}
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s][rank7] {'num_prompt_tokens': 38819, 'num_generated_tokens': 5469, 'num_samples': 126, 'runtime': 369.75426723994315, 'samples/s': 0.34076685832603343, 'tokens/s': 14.790904350675211}
+[rank3] {'num_prompt_tokens': 43051, 'num_generated_tokens': 7776, 'num_samples': 127, 'runtime': 369.7766402680427, 'samples/s': 0.3434505757528128, 'tokens/s': 21.028910842943876}
+[rank4] {'num_prompt_tokens': 44578, 'num_generated_tokens': 5343, 'num_samples': 126, 'runtime': 369.7775325793773, 'samples/s': 0.34074541825483284, 'tokens/s': 14.449228331234698}
+100%|██████████| 1012/1012 [06:09<00:00,  2.75it/s]100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s]
+100%|██████████| 1012/1012 [06:09<00:00,  2.74it/s][rank6] {'num_prompt_tokens': 39153, 'num_generated_tokens': 3582, 'num_samples': 126, 'runtime': 369.7764898072928, 'samples/s': 0.34074637915910844, 'tokens/s': 9.686932778951798}
+
+[rank5] {'num_prompt_tokens': 39718, 'num_generated_tokens': 4409, 'num_samples': 126, 'runtime': 369.77724893577397, 'samples/s': 0.3407456796291022, 'tokens/s': 11.92339445622787}
+[rank0] {'num_prompt_tokens': 47610, 'num_generated_tokens': 4321, 'num_samples': 127, 'runtime': 369.79228969477117, 'samples/s': 0.3434360410943846, 'tokens/s': 11.684938059597131}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:33:28.084109
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=si
++ lp=zh2si
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/train.log
+[2025-09-15 23:34:01,229] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:34:08,175] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:34:08,318] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:34:08,474] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 23:34:08,557] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 23:34:08,639] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:34:08,659] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:34:08,661] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:34:08,671] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2si.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.18s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.39s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.00it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:34:12.703484
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [01:32<10:31,  5.79s/it][A
+ 13%|█▎        | 16/125 [01:33<10:38,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:34<10:40,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:34<10:40,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:34<10:41,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:34<10:41,  5.89s/it][A
+ 13%|█▎        | 16/125 [01:34<10:44,  5.91s/it][A
+ 13%|█▎        | 16/125 [01:35<10:49,  5.96s/it][A
+ 26%|██▌       | 32/125 [03:03<08:52,  5.73s/it][A
+ 26%|██▌       | 32/125 [03:04<08:55,  5.75s/it][A
+ 26%|██▌       | 32/125 [03:04<08:56,  5.77s/it][A
+ 26%|██▌       | 32/125 [03:07<09:03,  5.84s/it][A
+ 26%|██▌       | 32/125 [03:07<09:04,  5.86s/it][A
+ 26%|██▌       | 32/125 [03:07<09:05,  5.86s/it][A
+ 26%|██▌       | 32/125 [03:09<09:09,  5.91s/it][A
+ 26%|██▌       | 32/125 [03:09<09:09,  5.91s/it][A
+ 38%|███▊      | 48/125 [03:25<04:42,  3.67s/it][A
+ 38%|��██▊      | 48/125 [04:34<07:18,  5.70s/it][A
+ 38%|███▊      | 48/125 [04:35<07:20,  5.71s/it][A
+ 38%|███▊      | 48/125 [04:35<07:19,  5.71s/it][A
+ 38%|███▊      | 48/125 [04:39<07:27,  5.81s/it][A
+ 38%|███▊      | 48/125 [04:41<07:30,  5.85s/it][A
+ 38%|███▊      | 48/125 [04:41<07:30,  5.86s/it][A
+ 38%|███▊      | 48/125 [04:42<07:33,  5.89s/it][A
+ 51%|█████     | 64/125 [04:59<04:36,  4.54s/it][A
+ 51%|█████     | 64/125 [06:05<05:47,  5.70s/it][A
+ 51%|█████     | 64/125 [06:06<05:48,  5.71s/it][A
+ 51%|█████     | 64/125 [06:07<05:50,  5.74s/it][A
+ 51%|█████     | 64/125 [06:12<05:54,  5.81s/it][A
+ 51%|█████     | 64/125 [06:13<05:53,  5.80s/it][A
+ 51%|█████     | 64/125 [06:14<05:56,  5.85s/it][A
+ 51%|█████     | 64/125 [06:15<05:57,  5.86s/it][A
+ 64%|██████▍   | 80/125 [06:32<03:45,  5.00s/it][A
+ 64%|██████▍   | 80/125 [07:37<04:16,  5.71s/it][A
+ 64%|██████▍   | 80/125 [07:37<04:17,  5.72s/it][A
+ 64%|██████▍   | 80/125 [07:39<04:17,  5.73s/it][A
+ 64%|██████▍   | 80/125 [07:45<04:21,  5.81s/it][A
+ 64%|██████▍   | 80/125 [07:45<04:20,  5.79s/it][A
+ 64%|██████▍   | 80/125 [07:48<04:23,  5.85s/it][A
+ 64%|██████▍   | 80/125 [07:50<04:25,  5.89s/it][A
+ 77%|███████▋  | 96/125 [08:04<02:32,  5.25s/it][A
+ 77%|███████▋  | 96/125 [09:09<02:45,  5.72s/it][A
+ 77%|███████▋  | 96/125 [09:09<02:45,  5.70s/it][A
+ 77%|███████▋  | 96/125 [09:10<02:46,  5.75s/it][A
+ 77%|███████▋  | 96/125 [09:18<02:48,  5.81s/it][A
+ 77%|███████▋  | 96/125 [09:18<02:48,  5.80s/it][A
+ 77%|███████▋  | 96/125 [09:20<02:48,  5.82s/it][A
+ 77%|███████▋  | 96/125 [09:22<02:49,  5.85s/it][A
+ 90%|████████▉ | 112/125 [09:36<01:10,  5.43s/it][A
+ 90%|████████▉ | 112/125 [10:39<01:13,  5.67s/it][A
+ 90%|████████▉ | 112/125 [10:40<01:14,  5.72s/it][A
+ 90%|████████▉ | 112/125 [10:42<01:14,  5.74s/it][A
+ 90%|████████▉ | 112/125 [10:49<01:14,  5.77s/it][A
+ 90%|████████▉ | 112/125 [10:50<01:15,  5.79s/it][A
+ 90%|████████▉ | 112/125 [10:56<01:16,  5.88s/it][A
+100%|██████████| 125/125 [10:57<00:00,  5.63s/it][A100%|██████████| 125/125 [10:57<00:00,  5.26s/it]
+
+ 90%|████████▉ | 112/125 [10:57<01:16,  5.87s/it][A
+100%|██████████| 125/125 [11:57<00:00,  5.76s/it][A100%|██████████| 125/125 [11:57<00:00,  5.74s/it]
+
+100%|██████████| 125/125 [12:00<00:00,  5.82s/it][A100%|██████████| 125/125 [12:00<00:00,  5.76s/it]
+
+100%|██████████| 125/125 [12:01<00:00,  5.85s/it][A100%|██████████| 125/125 [12:01<00:00,  5.78s/it]
+
+100%|██████████| 125/125 [12:10<00:00,  5.88s/it][A100%|██████████| 125/125 [12:10<00:00,  5.84s/it]
+
+100%|██████████| 125/125 [12:10<00:00,  5.90s/it][A100%|██████████| 125/125 [12:10<00:00,  5.84s/it]
+
+100%|██████████| 125/125 [12:17<00:00,  5.98s/it][A100%|██████████| 125/125 [12:17<00:00,  5.90s/it]
+
+100%|██████████| 125/125 [12:20<00:00,  6.01s/it][A100%|██████████| 125/125 [12:20<00:00,  5.92s/it]
+ 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s] 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+
+  0%|          | 0/1 [00:00<?, ?it/s][A  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [12:20<00:08,  1.35it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:03<00:00,  3.89s/it][A100%|██████████| 1/1 [00:03<00:00,  3.89s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.62s/it][A100%|██████████| 1/1 [00:04<00:00,  4.62s/it]
+
+100%|██████████| 2/2 [00:08<00:00,  4.24s/it][A100%|██████████| 2/2 [00:08<00:00,  4.24s/it]
+
+100%|██████████| 2/2 [00:09<00:00,  4.97s/it][A100%|██████████| 2/2 [00:09<00:00,  4.97s/it]
+
+100%|██████████| 1/1 [00:26<00:00, 26.51s/it][A100%|██████████| 1/1 [00:26<00:00, 26.51s/it]
+
+100%|██████████| 1/1 [00:27<00:00, 27.68s/it][A100%|██████████| 1/1 [00:27<00:00, 27.68s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.02s/it][A100%|██████████| 2/2 [00:44<00:00, 22.02s/it]
+
+100%|██████████| 2/2 [00:45<00:00, 22.89s/it][A100%|██████████| 2/2 [00:45<00:00, 22.89s/it]
+100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]
+100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]
+100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s]
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 49307, 'num_samples': 127, 'runtime': 786.0625100638717, 'samples/s': 0.1615647590033019, 'tokens/s': 62.726563560439416}
+100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 53489, 'num_samples': 126, 'runtime': 786.0644822716713, 'samples/s': 0.1602921933781677, 'tokens/s': 68.046580409562}
+[rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 49372, 'num_samples': 127, 'runtime': 786.0633747410029, 'samples/s': 0.1615645812805421, 'tokens/s': 62.809185094353744}
+[rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 55330, 'num_samples': 126, 'runtime': 786.0648722741753, 'samples/s': 0.16029211384992645, 'tokens/s': 70.38859253425738}
+100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]
+
+[rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 46740, 'num_samples': 126, 'runtime': 786.0635142903775, 'samples/s': 0.16029239076659993, 'tokens/s': 59.46084400341969}
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 46365, 'num_samples': 127, 'runtime': 786.0792586766183, 'samples/s': 0.16156131662067674, 'tokens/s': 58.982601930060454}
+100%|██████████| 1012/1012 [13:06<00:00,  1.26it/s][rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 51481, 'num_samples': 126, 'runtime': 786.0419291164726, 'samples/s': 0.1602967924899714, 'tokens/s': 65.49396169981124}
+100%|██████████| 1012/1012 [13:06<00:00,  1.29it/s]
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 53852, 'num_samples': 127, 'runtime': 786.0628248155117, 'samples/s': 0.1615646943102885, 'tokens/s': 68.50851903935163}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:47:22.107659
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
++ for lang in en de ru bn hi th jv sw si km
++ for src in $lang zh
++ '[' km = zh ']'
++ src_lang=km
++ tgt_lang=zh
++ lp=km2zh
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/train.log
+[2025-09-15 23:47:55,272] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:48:02,275] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:48:02,281] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:48:02,373] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:48:02,558] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:48:02,615] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:48:02,626] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-15 23:48:02,688] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:48:02,690] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.km2zh.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.19s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.48s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.26s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.08s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.25s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.27s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.28s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:48:07.052034
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:02<00:19,  5.63it/s][A
+ 13%|█▎        | 16/125 [00:03<00:22,  4.80it/s][A
+ 13%|█▎        | 16/125 [00:03<00:25,  4.29it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.88it/s][A
+ 13%|█▎        | 16/125 [00:04<00:28,  3.84it/s][A
+ 13%|█▎        | 16/125 [00:04<00:30,  3.53it/s][A
+ 13%|█▎        | 16/125 [00:05<00:40,  2.68it/s][A
+ 26%|██▌       | 32/125 [00:06<00:18,  5.13it/s][A
+ 26%|██▌       | 32/125 [00:06<00:19,  4.75it/s][A
+ 13%|█▎        | 16/125 [00:07<00:47,  2.28it/s][A
+ 26%|██▌       | 32/125 [00:07<00:20,  4.63it/s][A
+ 26%|██▌       | 32/125 [00:08<00:24,  3.83it/s][A
+ 26%|██▌       | 32/125 [00:09<00:26,  3.49it/s][A
+ 26%|██▌       | 32/125 [00:09<00:27,  3.44it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.63it/s][A
+ 26%|██▌       | 32/125 [00:10<00:31,  2.93it/s][A
+ 38%|███▊      | 48/125 [00:10<00:16,  4.69it/s][A
+ 38%|███▊      | 48/125 [00:12<00:20,  3.81it/s][A
+ 38%|███▊      | 48/125 [00:13<00:20,  3.84it/s][A
+ 26%|██▌       | 32/125 [00:13<00:39,  2.36it/s][A
+ 38%|███▊      | 48/125 [00:14<00:22,  3.39it/s][A
+ 51%|█████     | 64/125 [00:14<00:13,  4.42it/s][A
+ 51%|█████     | 64/125 [00:14<00:14,  4.12it/s][A
+ 38%|███▊      | 48/125 [00:15<00:25,  3.03it/s][A
+ 51%|█████     | 64/125 [00:15<00:14,  4.29it/s][A
+ 38%|███▊      | 48/125 [00:16<00:24,  3.11it/s][A
+ 51%|█████     | 64/125 [00:17<00:16,  3.81it/s][A
+ 64%|██████▍   | 80/125 [00:18<00:11,  4.08it/s][A
+ 51%|█████     | 64/125 [00:18<00:17,  3.39it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:10,  4.41it/s][A
+ 64%|██████▍   | 80/125 [00:19<00:11,  3.89it/s][A
+ 51%|█████     | 64/125 [00:20<00:19,  3.12it/s][A
+ 64%|██████▍   | 80/125 [00:22<00:12,  3.60it/s][A
+ 77%|███████▋  | 96/125 [00:22<00:07,  3.93it/s][A
+ 51%|█████     | 64/125 [00:23<00:21,  2.85it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:06,  4.15it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:12,  3.50it/s][A
+ 77%|███████▋  | 96/125 [00:23<00:07,  3.80it/s][A
+ 64%|██████▍   | 80/125 [00:23<00:13,  3.28it/s][A
+ 77%|███████▋  | 96/125 [00:26<00:07,  3.86it/s][A
+ 90%|████████▉ | 112/125 [00:28<00:03,  3.77it/s][A
+ 77%|███████▋  | 96/125 [00:28<00:08,  3.42it/s][A
+ 64%|██████▍   | 80/125 [00:29<00:16,  2.81it/s][A
+ 90%|████████▉ | 112/125 [00:29<00:02,  4.37it/s][A
+100%|██████████| 125/125 [00:31<00:00,  3.86it/s][A100%|██████████| 125/125 [00:31<00:00,  3.98it/s]
+
+100%|██████████| 125/125 [00:32<00:00,  4.46it/s][A100%|██████████| 125/125 [00:32<00:00,  3.89it/s]
+
+ 77%|███████▋  | 96/125 [00:32<00:11,  2.52it/s][A
+ 77%|███████▋  | 96/125 [00:32<00:09,  3.20it/s][A
+ 90%|████████▉ | 112/125 [00:35<00:04,  2.93it/s][A
+ 90%|████████▉ | 112/125 [00:35<00:04,  3.00it/s][A
+ 90%|████████▉ | 112/125 [00:38<00:04,  2.98it/s][A
+100%|██████████| 125/125 [00:39<00:00,  3.08it/s][A100%|████���█████| 125/125 [00:39<00:00,  3.15it/s]
+
+100%|██████████| 125/125 [00:42<00:00,  3.01it/s][A100%|██████████| 125/125 [00:42<00:00,  2.91it/s]
+
+ 38%|███▊      | 48/125 [02:03<04:25,  3.45s/it][A
+ 51%|█████     | 64/125 [02:08<02:14,  2.20s/it][A
+100%|██████████| 125/125 [02:09<00:00,  2.24s/it][A100%|██████████| 125/125 [02:09<00:00,  1.04s/it]
+
+ 90%|████████▉ | 112/125 [02:12<00:31,  2.42s/it][A
+ 90%|████████▉ | 112/125 [02:13<00:31,  2.41s/it][A
+ 64%|██████▍   | 80/125 [02:14<01:09,  1.54s/it][A
+100%|██████████| 125/125 [02:16<00:00,  1.81s/it][A100%|██████████| 125/125 [02:16<00:00,  1.09s/it]
+
+100%|██████████| 125/125 [02:16<00:00,  1.83s/it][A100%|██████████| 125/125 [02:16<00:00,  1.09s/it]
+
+ 77%|███████▋  | 96/125 [02:17<00:31,  1.10s/it][A
+ 90%|████████▉ | 112/125 [02:22<00:10,  1.21it/s][A
+100%|██████████| 125/125 [02:27<00:00,  1.42it/s][A100%|██████████| 125/125 [02:27<00:00,  1.18s/it]
+ 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s] 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s] 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s] 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s] 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s] 99%|█████████▉| 1000/1012 [02:27<00:01,  6.79it/s]
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+
+  0%|          | 0/2 [00:00<?, ?it/s][A  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:27<00:01,  6.78it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [02:27<00:01,  6.78it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:00<00:00,  2.32it/s][A100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.84it/s][A100%|██████████| 1/1 [00:00<00:00,  1.84it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.62it/s][A100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
+
+100%|██████████| 1/1 [00:00<00:00,  1.20it/s][A100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.82it/s][A100%|██████████| 2/2 [00:01<00:00,  1.82it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.70it/s][A100%|██████████| 2/2 [00:01<00:00,  1.70it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.40it/s][A100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
+
+100%|██████████| 2/2 [00:01<00:00,  1.24it/s][A100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]
+
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]
+[rank7] {'num_prompt_tokens': 37707, 'num_generated_tokens': 4313, 'num_samples': 126, 'runtime': 149.06150723807514, 'samples/s': 0.8452886485225042, 'tokens/s': 28.934364611726668}
+
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]
+[rank1] {'num_prompt_tokens': 41834, 'num_generated_tokens': 4886, 'num_samples': 127, 'runtime': 149.07249221391976, 'samples/s': 0.8519345059164529, 'tokens/s': 32.775999967777864}[rank6] {'num_prompt_tokens': 35077, 'num_generated_tokens': 4408, 'num_samples': 126, 'runtime': 149.05143583007157, 'samples/s': 0.8453457646905749, 'tokens/s': 29.573683577429}
+
+[rank4] {'num_prompt_tokens': 41429, 'num_generated_tokens': 3317, 'num_samples': 126, 'runtime': 149.0535582639277, 'samples/s': 0.8453337274705849, 'tokens/s': 22.253745825555}
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s][rank3] {'num_prompt_tokens': 41276, 'num_generated_tokens': 5662, 'num_samples': 127, 'runtime': 149.07293650880456, 'samples/s': 0.8519319668228251, 'tokens/s': 37.981407843707366}
+[rank5] {'num_prompt_tokens': 37873, 'num_generated_tokens': 3485, 'num_samples': 126, 'runtime': 149.07309472933412, 'samples/s': 0.8452229440112786, 'tokens/s': 23.377793332375443}
+
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s][rank2] {'num_prompt_tokens': 45616, 'num_generated_tokens': 3776, 'num_samples': 127, 'runtime': 149.06044628471136, 'samples/s': 0.8520033527702243, 'tokens/s': 25.332005197325724}
+100%|██████████| 1012/1012 [02:29<00:00,  6.79it/s]
+[rank0] {'num_prompt_tokens': 44859, 'num_generated_tokens': 4096, 'num_samples': 127, 'runtime': 149.08781281486154, 'samples/s': 0.8518469591992045, 'tokens/s': 27.47374129826726}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-15 23:50:39.214126
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
++ for src in $lang zh
++ '[' zh = zh ']'
++ src_lang=zh
++ tgt_lang=km
++ lp=zh2km
++ src_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ test_file=/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km
++ rm -rf '/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/*'
++ cp inference.sh /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km
++ swift infer --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/train.log
+[2025-09-15 23:51:12,078] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/infer.py --infer_backend pt --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl --load_from_cache_file True --dataset_shuffle False --val_dataset_shuffle False --model /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best --torch_dtype bfloat16 --max_new_tokens 1024 --max_batch_size 16 --num_beams 5 --max_length 1024 --dataset_num_proc 8 --temperature 0 --result_path /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-15 23:51:19,072] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:51:19,141] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:51:19,193] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:51:19,305] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-15 23:51:19,561] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:51:19,614] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:51:19,637] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-15 23:51:19,638] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully loaded /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/args.json.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Setting args.eval_human: False
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: InferArguments(model='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', model_type='qwen3', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl='flash_attn', new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen3', system=None, max_length=1024, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/merge_0701/train1/test/test.zh2km.jsonl'], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=8, load_from_cache_file=True, dataset_shuffle=False, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best', lora_modules=[], tuner_backend='peft', train_type='full', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, lazy_tokenize=False, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, vllm_gpu_memory_utilization=0.9, vllm_tensor_parallel_size=1, vllm_pipeline_parallel_size=1, vllm_enable_expert_parallel=False, vllm_max_num_seqs=256, vllm_max_model_len=None, vllm_disable_custom_all_reduce=True, vllm_enforce_eager=False, vllm_limit_mm_per_prompt={}, vllm_max_lora_rank=16, vllm_enable_prefix_caching=False, vllm_use_async_engine=False, vllm_quantization=None, vllm_data_parallel_size=1, gpu_memory_utilization=None, tensor_parallel_size=None, limit_mm_per_prompt=None, data_parallel_size=None, use_async_engine=None, sglang_tp_size=1, sglang_pp_size=1, sglang_dp_size=1, sglang_ep_size=1, sglang_enable_ep_moe=False, sglang_mem_fraction_static=None, sglang_context_length=None, sglang_disable_cuda_graph=False, sglang_quantization=None, sglang_kv_cache_dtype='auto', sglang_enable_dp_attention=False, sglang_disable_custom_all_reduce=True, lmdeploy_tp=1, lmdeploy_session_len=None, lmdeploy_cache_max_entry_count=0.8, lmdeploy_quant_policy=0, lmdeploy_vision_batch_size=1, merge_lora=False, safe_serialization=True, max_shard_size='5GB', infer_backend='pt', result_path='/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl', write_batch_size=1000, metric=None, max_batch_size=16, val_dataset_sample=None)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.53s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:02<00:02,  2.15s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.55s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.31s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.00s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.08s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.22s/it]Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.20s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.35s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 1024
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: hermes
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560, padding_idx=151643)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] Start time of running main: 2025-09-15 23:51:24.162632
+[INFO:swift] swift.__version__: 3.7.3
+[INFO:swift] request_config: RequestConfig(max_tokens=1024, temperature=0.0, top_k=None, top_p=None, repetition_penalty=None, num_beams=5, stop=[], seed=None, stream=False, logprobs=False, top_logprobs=None, n=1, best_of=None, presence_penalty=0.0, frequency_penalty=0.0, length_penalty=1.0, return_details=False)
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
+Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
+[INFO:swift] val_dataset: Dataset({
+    features: ['messages'],
+    num_rows: 1012
+})
+[INFO:swift] args.result_path: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl
+  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A  0%|          | 0/1012 [00:00<?, ?it/s]  0%|          | 0/1012 [00:00<?, ?it/s]
+  0%|          | 0/125 [00:00<?, ?it/s][A
+  0%|          | 0/125 [00:00<?, ?it/s][A
+ 13%|█▎        | 16/125 [00:20<02:21,  1.30s/it][A
+ 13%|█▎        | 16/125 [01:33<10:37,  5.85s/it][A
+ 13%|█▎        | 16/125 [01:33<10:38,  5.86s/it][A
+ 13%|█▎        | 16/125 [01:34<10:40,  5.88s/it][A
+ 13%|█▎        | 16/125 [01:34<10:43,  5.91s/it][A
+ 13%|█▎        | 16/125 [01:34<10:44,  5.91s/it][A
+ 13%|█▎        | 16/125 [01:35<10:49,  5.96s/it][A
+ 13%|█▎        | 16/125 [01:36<10:54,  6.00s/it][A
+ 26%|██▌       | 32/125 [01:51<06:01,  3.88s/it][A
+ 26%|██▌       | 32/125 [01:52<04:48,  3.10s/it][A
+ 26%|██▌       | 32/125 [02:01<05:20,  3.45s/it][A
+ 26%|██▌       | 32/125 [03:05<08:56,  5.77s/it][A
+ 26%|██▌       | 32/125 [03:05<08:59,  5.80s/it][A
+ 26%|██▌       | 32/125 [03:09<09:11,  5.93s/it][A
+ 26%|██▌       | 32/125 [03:09<09:11,  5.93s/it][A
+ 26%|██▌       | 32/125 [03:10<09:12,  5.94s/it][A
+ 38%|███▊      | 48/125 [03:22<06:01,  4.69s/it][A
+ 38%|███▊      | 48/125 [03:28<05:40,  4.42s/it][A
+ 38%|███▊      | 48/125 [03:30<04:54,  3.82s/it][A
+ 38%|███▊      | 48/125 [03:35<05:48,  4.53s/it][A
+ 38%|███▊      | 48/125 [04:35<07:20,  5.73s/it][A
+ 38%|███▊      | 48/125 [04:37<07:23,  5.75s/it][A
+ 38%|███▊      | 48/125 [04:41<07:28,  5.83s/it][A
+ 38%|███▊      | 48/125 [04:42<07:32,  5.87s/it][A
+ 51%|█████     | 64/125 [04:54<05:11,  5.10s/it][A
+ 51%|█████     | 64/125 [04:56<04:01,  3.96s/it][A
+ 51%|█████     | 64/125 [05:01<05:03,  4.98s/it][A
+ 51%|█████     | 64/125 [05:02<04:38,  4.57s/it][A
+ 51%|█████     | 64/125 [05:07<05:07,  5.03s/it][A
+ 51%|█████     | 64/125 [06:08<05:50,  5.75s/it][A
+ 51%|█████     | 64/125 [06:15<05:57,  5.85s/it][A
+ 51%|█████     | 64/125 [06:16<05:57,  5.86s/it][A
+ 64%|██████▍   | 80/125 [06:25<03:59,  5.33s/it][A
+ 64%|██████▍   | 80/125 [06:28<03:27,  4.60s/it][A
+ 64%|██████▍   | 80/125 [06:34<03:07,  4.16s/it][A
+ 64%|██████▍   | 80/125 [06:34<03:45,  5.01s/it][A
+ 64%|██████▍   | 80/125 [06:37<04:00,  5.34s/it][A
+ 77%|███████▋  | 96/125 [06:39<01:50,  3.81s/it][A
+ 64%|██████▍   | 80/125 [06:40<03:58,  5.31s/it][A
+ 64%|██████▍   | 80/125 [07:40<04:18,  5.74s/it][A
+ 64%|██████▍   | 80/125 [07:49<04:23,  5.85s/it][A
+ 77%|███████▋  | 96/125 [07:54<01:58,  4.10s/it][A
+ 77%|███████▋  | 96/125 [08:00<02:24,  4.99s/it][A
+ 77%|███████▋  | 96/125 [08:07<02:32,  5.27s/it][A
+ 77%|███████▋  | 96/125 [08:08<02:17,  4.75s/it][A
+ 77%|███████▋  | 96/125 [08:09<02:39,  5.49s/it][A
+ 90%|████████▉ | 112/125 [08:11<00:57,  4.43s/it][A
+ 77%|███████▋  | 96/125 [08:12<02:38,  5.46s/it][A
+ 90%|████████▉ | 112/125 [08:21<00:49,  3.79s/it][A
+100%|██████████| 125/125 [08:24<00:00,  3.50s/it][A100%|██████████| 125/125 [08:24<00:00,  4.04s/it]
+
+ 77%|███████▋  | 96/125 [09:21<02:48,  5.82s/it][A
+ 90%|████████▉ | 112/125 [09:24<00:59,  4.60s/it][A
+ 90%|████████▉ | 112/125 [09:39<01:10,  5.42s/it][A
+100%|██████████| 125/125 [09:40<00:00,  4.43s/it][A100%|██████████| 125/125 [09:40<00:00,  4.65s/it]
+
+ 90%|████████▉ | 112/125 [09:42<01:06,  5.11s/it][A
+ 90%|████████▉ | 112/125 [09:44<01:13,  5.63s/it][A
+ 90%|████████▉ | 112/125 [09:48<01:13,  5.62s/it][A
+100%|██████████| 125/125 [10:43<00:00,  4.99s/it][A100%|██████████| 125/125 [10:43<00:00,  5.14s/it]
+
+ 90%|████████▉ | 112/125 [10:54<01:15,  5.83s/it][A
+100%|██████████| 125/125 [10:59<00:00,  5.63s/it][A100%|██████████| 125/125 [10:59<00:00,  5.27s/it]
+
+100%|██████████| 125/125 [11:04<00:00,  5.44s/it][A100%|██████████| 125/125 [11:04<00:00,  5.31s/it]
+
+100%|██████████| 125/125 [11:07<00:00,  5.82s/it][A100%|██████████| 125/125 [11:07<00:00,  5.34s/it]
+
+100%|██████████| 125/125 [11:08<00:00,  5.77s/it][A100%|██████████| 125/125 [11:08<00:00,  5.35s/it]
+
+100%|██████████| 125/125 [11:09<00:00,  4.54s/it][A100%|██████████| 125/125 [11:09<00:00,  5.36s/it]
+ 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s] 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/1 [00:00<?, ?it/s][A
+  0%|          | 0/2 [00:00<?, ?it/s][A 99%|█████████▉| 1000/1012 [11:09<00:08,  1.49it/s]
+  0%|          | 0/2 [00:00<?, ?it/s][A
+100%|██████████| 1/1 [00:03<00:00,  3.36s/it][A100%|██████████| 1/1 [00:03<00:00,  3.36s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.33s/it][A100%|██████████| 1/1 [00:04<00:00,  4.33s/it]
+
+100%|██████████| 1/1 [00:04<00:00,  4.42s/it][A100%|██████████| 1/1 [00:04<00:00,  4.42s/it]
+
+100%|██████████| 1/1 [00:05<00:00,  5.17s/it][A100%|██████████| 1/1 [00:05<00:00,  5.17s/it]
+
+100%|██████████| 2/2 [00:06<00:00,  3.24s/it][A100%|██████████| 2/2 [00:06<00:00,  3.24s/it]
+
+100%|██████████| 2/2 [00:09<00:00,  4.84s/it][A100%|██████████| 2/2 [00:09<00:00,  4.84s/it]
+
+100%|██████████| 2/2 [00:44<00:00, 22.21s/it][A100%|██████████| 2/2 [00:44<00:00, 22.21s/it]
+
+100%|██████████| 2/2 [00:45<00:00, 22.83s/it][A100%|██████████| 2/2 [00:45<00:00, 22.84s/it]
+100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s][rank1] {'num_prompt_tokens': 10670, 'num_generated_tokens': 36518, 'num_samples': 127, 'runtime': 715.6073268949986, 'samples/s': 0.17747163175515498, 'tokens/s': 51.03077990893504}
+[rank3] {'num_prompt_tokens': 10473, 'num_generated_tokens': 39697, 'num_samples': 127, 'runtime': 715.5843489579856, 'samples/s': 0.1774773304991004, 'tokens/s': 55.47494164427392}
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s][rank7] {'num_prompt_tokens': 8849, 'num_generated_tokens': 32926, 'num_samples': 126, 'runtime': 715.6078144684434, 'samples/s': 0.17607409736517948, 'tokens/s': 46.011235951157936}
+100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.38it/s][rank5] {'num_prompt_tokens': 9151, 'num_generated_tokens': 30701, 'num_samples': 126, 'runtime': 715.5781205836684, 'samples/s': 0.17608140379868917, 'tokens/s': 42.90377125415521}
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+100%|██████████| 1012/1012 [11:55<00:00,  1.41it/s]
+[rank2] {'num_prompt_tokens': 10163, 'num_generated_tokens': 34575, 'num_samples': 127, 'runtime': 715.607663532719, 'samples/s': 0.17747154826856226, 'tokens/s': 48.31558095579165}
+[rank0] {'num_prompt_tokens': 10953, 'num_generated_tokens': 36823, 'num_samples': 127, 'runtime': 715.622737525031, 'samples/s': 0.17746780997935774, 'tokens/s': 51.455883203699926}[rank4] {'num_prompt_tokens': 8955, 'num_generated_tokens': 30276, 'num_samples': 126, 'runtime': 715.608059566468, 'samples/s': 0.17607403705924404, 'tokens/s': 42.30807576194978}
+
+[rank6] {'num_prompt_tokens': 9153, 'num_generated_tokens': 42967, 'num_samples': 126, 'runtime': 715.6061938088387, 'samples/s': 0.17607449612664844, 'tokens/s': 60.04280059582304}
+[INFO:swift] The inference results have been saved to result_path: `/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl`.
+[INFO:swift] End time of running main: 2025-09-16 00:03:23.226715
++ jq -r .response /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/generated_predictions.jsonl
++ hypo_file=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ lang_pair_strs=en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km
++ src_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
++ ref_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
++ hypo_file_strs=/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
++ metric=bleu,comet_22
++ python /mnt/nvme1/luoyingfeng/llm-mt/src/mt_scoring.py --metric bleu,comet_22 --comet_22_path /mnt/nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt --xcomet_xxl_path /mnt/nvme3/luoyingfeng/model_card/XCOMET-XXL/checkpoints/model.ckpt --lang_pair en2zh,zh2en,de2zh,zh2de,ru2zh,zh2ru,bn2zh,zh2bn,hi2zh,zh2hi,th2zh,zh2th,jv2zh,zh2jv,sw2zh,zh2sw,si2zh,zh2si,km2zh,zh2km --src_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh --ref_file /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh,/mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km --hypo_file /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt,/mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt --record_file result_mt.xlsx
+[2025-09-16 00:03:41,398] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../nvme3/luoyingfeng/model_card/wmt22-comet-da/checkpoints/model.ckpt`
+Encoder model frozen.
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+You are using a CUDA device ('NVIDIA H200') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+evaluate zh2en
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt -l zh-en
+30.62
+
+evaluate zh2ru
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt -l zh-ru
+15.85
+
+evaluate zh2de
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt -l zh-de
+17.32
+
+evaluate zh2bn
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt -l zh-bn
+5.30
+
+evaluate zh2hi
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt -l zh-hi
+10.61
+
+evaluate zh2th
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt -l zh-th
+6.79
+
+evaluate zh2jv
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt -l zh-jv
+4.47
+
+evaluate zh2sw
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt -l zh-sw
+1.71
+
+evaluate zh2si
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt -l zh-si
+1.76
+
+evaluate zh2km
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt -l zh-km
+4.09
+
+evaluate en2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt -l en-zh
+44.91
+
+evaluate ru2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt -l ru-zh
+36.67
+
+evaluate de2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt -l de-zh
+38.54
+
+evaluate bn2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt -l bn-zh
+29.64
+
+evaluate hi2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt -l hi-zh
+32.33
+
+evaluate th2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt -l th-zh
+34.31
+
+evaluate jv2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt -l jv-zh
+28.23
+
+evaluate sw2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt -l sw-zh
+13.07
+
+evaluate si2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt -l si-zh
+13.61
+
+evaluate km2zh
+sacrebleu -w 2 -b /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh -i /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt -l km-zh
+20.47
+
+evaluate zh2en
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2en/hypo.zh2en.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:03,  2.11it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  3.19it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  3.58it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  3.66it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.63it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.59it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.52it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.33it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.14it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2ru
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2ru/hypo.zh2ru.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.44it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  5.06it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.69it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.37it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  4.11it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.93it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.70it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.46it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.25it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2de
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2de/hypo.zh2de.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.40it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  5.42it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.93it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.46it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  4.17it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.98it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.73it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.48it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.27it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2bn
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2bn/hypo.zh2bn.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.18it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:01<00:03,  1.94it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:03,  1.55it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:02<00:02,  1.56it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:03<00:01,  1.51it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:04<00:01,  1.43it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:05<00:00,  1.38it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.34it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.27it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2hi
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2hi/hypo.zh2hi.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.59it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:02,  2.44it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:02,  1.98it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  2.22it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:02<00:01,  1.96it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:03<00:01,  1.61it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:04<00:00,  1.52it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.27it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2th
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2th/hypo.zh2th.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.84it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:01<00:03,  1.65it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:02,  1.98it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  2.24it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:02<00:01,  2.39it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:02<00:00,  2.50it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:03<00:00,  1.95it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:04<00:00,  1.65it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.41it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2jv
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2jv/hypo.zh2jv.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.63it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:01<00:03,  1.63it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:02<00:03,  1.29it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:03<00:03,  1.16it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:04<00:02,  1.10it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:05<00:01,  1.05it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:06<00:00,  1.02it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:08<00:00,  1.02s/it]Predicting DataLoader 0: 100%|██████████| 8/8 [00:08<00:00,  1.12s/it]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2sw
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2sw/hypo.zh2sw.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.31it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:01<00:03,  1.61it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:02<00:03,  1.29it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:03<00:03,  1.16it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:04<00:02,  1.09it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:05<00:01,  1.05it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:06<00:00,  1.02it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:08<00:00,  1.01s/it]Predicting DataLoader 0: 100%|██████████| 8/8 [00:08<00:00,  1.12s/it]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2si
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2si/hypo.zh2si.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.29it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:02,  2.79it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:02,  2.28it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:02<00:02,  1.96it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:02<00:01,  1.83it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:03<00:01,  1.72it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:04<00:00,  1.63it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.54it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.39it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate zh2km
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/zh2km/hypo.zh2km.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  5.79it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:02,  2.07it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:03,  1.56it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:02<00:02,  1.36it/s]Predicting DataLoader 0:  62%|���█████▎   | 5/8 [00:03<00:02,  1.32it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:04<00:01,  1.21it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:06<00:00,  1.13it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:07<00:00,  1.07it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:07<00:00,  1.00it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate en2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.en
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-en/test.zh-en.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/en2zh/hypo.en2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.90it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  5.50it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.60it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.27it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  4.06it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.91it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.60it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.33it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.14it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate ru2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.ru
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-ru/test.zh-ru.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/ru2zh/hypo.ru2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  5.98it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.63it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.22it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.01it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.81it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:02<00:00,  2.40it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:02<00:00,  2.48it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:03<00:00,  2.47it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:03<00:00,  2.37it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate de2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.de
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-de/test.zh-de.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/de2zh/hypo.de2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.37it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.61it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.21it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  3.93it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.79it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.56it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:02<00:00,  3.40it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.16it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  2.98it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate bn2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.bn
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-bn/test.zh-bn.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/bn2zh/hypo.bn2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  5.65it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.70it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.10it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  3.87it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.73it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.51it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:02<00:00,  3.28it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:03<00:00,  2.35it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:03<00:00,  2.25it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate hi2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.hi
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-hi/test.zh-hi.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/hi2zh/hypo.hi2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.92it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.86it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.22it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  3.74it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.49it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.42it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:02<00:00,  3.30it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.06it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  2.89it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate th2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.th
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-th/test.zh-th.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/th2zh/hypo.th2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.91it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  5.21it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.45it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.12it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.91it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.78it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.57it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.36it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:02<00:00,  3.15it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate jv2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.jv
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-jv/test.zh-jv.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/jv2zh/hypo.jv2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:00,  7.95it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  5.42it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.59it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:00<00:00,  4.19it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.98it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:01<00:00,  3.70it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:01<00:00,  3.58it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:03<00:00,  2.50it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:04<00:00,  1.98it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate sw2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.sw
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-sw/test.zh-sw.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/sw2zh/hypo.sw2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.89it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.83it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:01<00:02,  1.98it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:02<00:02,  1.52it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:03<00:02,  1.33it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:04<00:01,  1.43it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:04<00:00,  1.54it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.38it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:06<00:00,  1.20it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate si2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.si
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-si/test.zh-si.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/si2zh/hypo.si2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.73it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.78it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.24it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  2.17it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:02<00:01,  2.31it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:02<00:00,  2.36it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:03<00:00,  1.87it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.59it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.36it/s]
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+IPU available: False, using: 0 IPUs
+HPU available: False, using: 0 HPUs
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
+evaluate km2zh
+comet22
+src_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.km
+ref_file: /mnt/nvme1/luoyingfeng/llm-mt/data/flores200/zh-km/test.zh-km.zh
+hypo_file: /mnt/nvme1/luoyingfeng/llm-mt/exps_arr/Qwen3-4B-Base/sft_0915_0.1/base/best/decode_result/km2zh/hypo.km2zh.txt
+Predicting: 0it [00:00, ?it/s]Predicting:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s]Predicting DataLoader 0:  12%|█▎        | 1/8 [00:00<00:01,  6.54it/s]Predicting DataLoader 0:  25%|██▌       | 2/8 [00:00<00:01,  4.58it/s]Predicting DataLoader 0:  38%|███▊      | 3/8 [00:00<00:01,  4.08it/s]Predicting DataLoader 0:  50%|█████     | 4/8 [00:01<00:01,  3.80it/s]Predicting DataLoader 0:  62%|██████▎   | 5/8 [00:01<00:00,  3.53it/s]Predicting DataLoader 0:  75%|███████▌  | 6/8 [00:02<00:00,  2.26it/s]Predicting DataLoader 0:  88%|████████▊ | 7/8 [00:03<00:00,  2.28it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:04<00:00,  1.80it/s]Predicting DataLoader 0: 100%|██████████| 8/8 [00:05<00:00,  1.51it/s]
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-14B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=25
++ per_device_eval_batch_size=25
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/train.log
+[2025-09-16 00:10:35,066] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-16 00:10:41,846] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:10:42,208] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:10:42,281] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:10:42,331] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:10:42,369] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:10:42,439] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:10:42,495] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:10:42,497] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-16 00:10:43,406] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:43.255212426 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-16 00:10:43,636] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:10:43,636] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[W916 00:10:43.488148216 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s][2025-09-16 00:10:44,271] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:44.119673365 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl'],
+dataset_num_proc=1,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=1000.0,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=3,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=2048,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=5000,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+overwrite_output_dir=False,
+packing=True,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=25,
+per_device_train_batch_size=25,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000.0,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=True,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] attn_impl: flash_attn
+[2025-09-16 00:10:44,311] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:44.158239108 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+[2025-09-16 00:10:44,562] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:10:44,570] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:44.415352279 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:10:44.423868541 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:10:44,585] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:44.436872039 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s][2025-09-16 00:10:44,603] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:10:44.449382570 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:10,  1.53s/it]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:08,  1.15s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:05,  1.21it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.15it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:05,  1.37it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:05,  1.20it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:05,  1.32it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.14s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:05,  1.17it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:04,  1.36it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.19it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.05s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.10it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:04,  1.33it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:04,  1.21it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.01s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.16it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:03,  1.36it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.19it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:03,  1.32it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.18it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.07s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.00it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.21it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:03,  1.04it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:02<00:02,  1.34it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.18it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.29it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.17it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.24it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:02,  1.07it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:04,  1.03s/it]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.07s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:03<00:02,  1.33it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:03<00:02,  1.27it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.17it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.18it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.22it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:04<00:01,  1.33it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.10it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:04<00:01,  1.29it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.04s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.07s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.17it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.18it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:04<00:01,  1.19it/s]Loading checkpoint shards:  88%|████████��� | 7/8 [00:05<00:00,  1.35it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.13it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.31it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.63it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.44it/s]
+[rank5]: Traceback (most recent call last):
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank5]:     pt_main()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank5]:     return SwiftPt(args).main()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank5]:     result = self.run()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank5]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank5]:     train_dataset, val_dataset = self._get_dataset()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank5]:     train_dataset, val_dataset = load_dataset(
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank5]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank5]:     dataset = DatasetLoader._load_repo_dataset(
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank5]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank5]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.18it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.37it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.12it/s]
+[rank6]: Traceback (most recent call last):
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank6]:     pt_main()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank6]:     return SwiftPt(args).main()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank6]:     result = self.run()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank6]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank6]:     train_dataset, val_dataset = self._get_dataset()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank6]:     train_dataset, val_dataset = load_dataset(
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank6]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank6]:     dataset = DatasetLoader._load_repo_dataset(
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank6]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank6]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.04s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.06s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.59it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.40it/s]
+[rank4]: Traceback (most recent call last):
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank4]:     pt_main()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank4]:     return SwiftPt(args).main()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank4]:     result = self.run()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank4]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank4]:     train_dataset, val_dataset = self._get_dataset()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank4]:     train_dataset, val_dataset = load_dataset(
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank4]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank4]:     dataset = DatasetLoader._load_repo_dataset(
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank4]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank4]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.19it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.23it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.42it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.26it/s]
+[rank7]: Traceback (most recent call last):
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank7]:     pt_main()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank7]:     return SwiftPt(args).main()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank7]:     result = self.run()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank7]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank7]:     train_dataset, val_dataset = self._get_dataset()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank7]:     train_dataset, val_dataset = load_dataset(
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank7]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank7]:     dataset = DatasetLoader._load_repo_dataset(
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank7]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank7]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.49it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.30it/s]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.44it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.27it/s]
+[rank2]: Traceback (most recent call last):
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank2]:     pt_main()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank2]:     return SwiftPt(args).main()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank2]:     result = self.run()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank2]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank2]:     train_dataset, val_dataset = self._get_dataset()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank2]:     train_dataset, val_dataset = load_dataset(
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank2]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank2]:     dataset = DatasetLoader._load_repo_dataset(
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank2]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank2]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+[rank1]: Traceback (most recent call last):
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank1]:     pt_main()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank1]:     return SwiftPt(args).main()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank1]:     result = self.run()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank1]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank1]:     train_dataset, val_dataset = self._get_dataset()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 66, in _get_dataset
+[rank1]:     train_dataset, val_dataset = load_dataset(
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank1]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank1]:     dataset = DatasetLoader._load_repo_dataset(
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank1]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank1]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/train1.jsonl`. os.path.exists(dataset_id): False
+W0916 00:10:52.150000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448074 closing signal SIGTERM
+W0916 00:10:52.151000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448075 closing signal SIGTERM
+W0916 00:10:52.151000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448076 closing signal SIGTERM
+W0916 00:10:52.151000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448077 closing signal SIGTERM
+W0916 00:10:52.152000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448078 closing signal SIGTERM
+W0916 00:10:52.152000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448080 closing signal SIGTERM
+W0916 00:10:52.152000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448081 closing signal SIGTERM
+E0916 00:10:53.395000 138281103083008 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 5 (pid: 1448079) of binary: /mnt/nvme1/luoyingfeng/h200_ms/bin/python
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-09-16_00:10:52
+  host      : localhost
+  rank      : 5 (local_rank: 5)
+  exitcode  : 1 (pid: 1448079)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-14B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=25
++ per_device_eval_batch_size=25
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/train.log
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
+[2025-09-16 00:11:56,573] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-16 00:12:03,489] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:12:03,558] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:12:03,604] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:12:03,876] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:12:03,991] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:12:03,996] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:12:04,060] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:12:04,062] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-16 00:12:05,204] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:12:05,204] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[W916 00:12:05.051227325 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:12:05,267] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:12:05.114619895 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:12:05,286] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:12:05.134296545 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:12:05,372] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:12:05.224154298 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl'],
+dataset_num_proc=1,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=1000.0,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=3,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=2048,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=5000,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+overwrite_output_dir=False,
+packing=True,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=25,
+per_device_train_batch_size=25,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000.0,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=True,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s][2025-09-16 00:12:06,151] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:12:06.000983621 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:12:06,176] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:12:06,177] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:12:06,180] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:12:06.027188383 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:12:06.027194263 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:12:06.030467074 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.16it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:08,  1.16s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:10,  1.57s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.11s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.05s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.06s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.03s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.05s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:04,  1.26it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.07it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.07s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:07,  1.26s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.09it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.10it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:03,  1.32it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.04s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.03s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.13it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.19it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.04s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.16s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.12it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.32it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.14it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.04s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.03s/it]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.05it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.17it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:03<00:02,  1.32it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.04it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.12s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.15it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.03s/it]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.03s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:04<00:01,  1.32it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.13it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.05it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.15it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.10s/it]Loading checkpoint shards:  62%|███��██▎   | 5/8 [00:04<00:02,  1.00it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.34it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.03s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.03s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.08it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.62it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s]
+Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.11it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.16it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.09s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:02,  1.02s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.40it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.21it/s]
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.15it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.17it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.03s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.02s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.42it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.16it/s]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.45it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:07<00:01,  1.08s/it]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:01,  1.03s/it]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:07<00:01,  1.02s/it]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:07<00:01,  1.02s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.13it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.05s/it]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.17it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.07it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 40,
+  "model_type": "qwen3",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] Setting args.use_chat_template: False
+[INFO:swift] Setting args.loss_scale: 'all'
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 2048
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: react_en
+[INFO:swift] Start time of running main: 2025-09-16 00:12:14.263847
+[INFO:swift] swift.__version__: 3.7.3
+Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.19it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.05it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.19it/s]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.04it/s]
+[rank3]: Traceback (most recent call last):
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank3]:     pt_main()
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank3]:     return SwiftPt(args).main()
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank3]:     result = self.run()
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank3]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank3]:     train_dataset, val_dataset = self._get_dataset()
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank3]:     _, val_dataset = load_dataset(
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank3]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank3]:     dataset = DatasetLoader._load_repo_dataset(
+[rank3]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank3]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank3]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank1]: Traceback (most recent call last):
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank1]:     pt_main()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank1]:     return SwiftPt(args).main()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank1]:     result = self.run()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank1]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank1]:     train_dataset, val_dataset = self._get_dataset()
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank1]:     _, val_dataset = load_dataset(
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank1]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank1]:     dataset = DatasetLoader._load_repo_dataset(
+[rank1]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank1]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank1]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank5]: Traceback (most recent call last):
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank5]:     pt_main()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank5]:     return SwiftPt(args).main()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank5]:     result = self.run()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank5]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank5]:     train_dataset, val_dataset = self._get_dataset()
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank5]:     _, val_dataset = load_dataset(
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank5]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank5]:     dataset = DatasetLoader._load_repo_dataset(
+[rank5]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank5]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank5]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank0]:     pt_main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank0]:     return SwiftPt(args).main()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank0]:     result = self.run()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank0]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank0]:     train_dataset, val_dataset = self._get_dataset()
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank0]:     _, val_dataset = load_dataset(
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank0]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank0]:     dataset = DatasetLoader._load_repo_dataset(
+[rank0]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank0]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank0]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank4]: Traceback (most recent call last):
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank4]:     pt_main()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank4]:     return SwiftPt(args).main()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank4]:     result = self.run()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank4]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank4]:     train_dataset, val_dataset = self._get_dataset()
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank4]:     _, val_dataset = load_dataset(
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank4]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank4]:     dataset = DatasetLoader._load_repo_dataset(
+[rank4]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank4]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank4]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank2]: Traceback (most recent call last):
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank2]:     pt_main()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank2]:     return SwiftPt(args).main()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank2]:     result = self.run()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank2]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank2]:     train_dataset, val_dataset = self._get_dataset()
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank2]:     _, val_dataset = load_dataset(
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank2]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank2]:     dataset = DatasetLoader._load_repo_dataset(
+[rank2]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank2]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank2]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank7]: Traceback (most recent call last):
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank7]:     pt_main()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank7]:     return SwiftPt(args).main()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank7]:     result = self.run()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank7]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank7]:     train_dataset, val_dataset = self._get_dataset()
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank7]:     _, val_dataset = load_dataset(
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank7]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank7]:     dataset = DatasetLoader._load_repo_dataset(
+[rank7]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank7]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank7]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+[rank6]: Traceback (most recent call last):
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py", line 5, in <module>
+[rank6]:     pt_main()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/pt.py", line 24, in pt_main
+[rank6]:     return SwiftPt(args).main()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/base.py", line 49, in main
+[rank6]:     result = self.run()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 153, in run
+[rank6]:     train_dataset, val_dataset = self._prepare_dataset()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 112, in _prepare_dataset
+[rank6]:     train_dataset, val_dataset = self._get_dataset()
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/train/sft.py", line 70, in _get_dataset
+[rank6]:     _, val_dataset = load_dataset(
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 533, in load_dataset
+[rank6]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 408, in load
+[rank6]:     dataset = DatasetLoader._load_repo_dataset(
+[rank6]:   File "/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+[rank6]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+[rank6]: ValueError: The local path does not exist, dataset_id: `/mnt/nvme1/luoyingfeng/llm-mt/data/10lang_cpt_mono_0.5B/valid.jsonl`. os.path.exists(dataset_id): False
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448920 closing signal SIGTERM
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448921 closing signal SIGTERM
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448922 closing signal SIGTERM
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448923 closing signal SIGTERM
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448925 closing signal SIGTERM
+W0916 00:12:19.688000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1448926 closing signal SIGTERM
+E0916 00:12:20.580000 136793776154112 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 1448919) of binary: /mnt/nvme1/luoyingfeng/h200_ms/bin/python
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+  time      : 2025-09-16_00:12:19
+  host      : localhost
+  rank      : 5 (local_rank: 5)
+  exitcode  : 1 (pid: 1448924)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-09-16_00:12:19
+  host      : localhost
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 1448919)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-14B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=25
++ per_device_eval_batch_size=25
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/train.log
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
+[2025-09-16 00:12:56,513] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-16 00:13:03,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:13:03,602] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[2025-09-16 00:13:03,673] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:13:03,966] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:13:04,036] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:13:04,054] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:13:04,090] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:13:04,091] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[2025-09-16 00:13:05,116] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:13:05.965211210 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:13:05,333] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:13:05,335] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:13:05.182752350 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:13:05.185157234 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-16 00:13:05,533] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:13:05,533] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[W916 00:13:05.384427131 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s][2025-09-16 00:13:06,036] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:13:06.886468569 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:13:06,084] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:13:06.931113220 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl'],
+dataset_num_proc=1,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=1000.0,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=3,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=2048,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=5000,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+overwrite_output_dir=False,
+packing=True,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=25,
+per_device_train_batch_size=25,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000.0,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=True,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base
+[INFO:swift] attn_impl: flash_attn
+[2025-09-16 00:13:06,156] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:13:06,163] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:13:06.008746920 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:13:06.015984146 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s][INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:09,  1.33s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.11it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.10it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:08,  1.20s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.04s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:01<00:07,  1.10s/it]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.03it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:06,  1.02it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.04s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:04,  1.25it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.17it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:05,  1.01it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:01<00:05,  1.05it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.03s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:04,  1.05it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:03,  1.29it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.08s/it]Loading checkpoint shards:  25%|██▌       | 2/8 [00:02<00:06,  1.05s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.17it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.09it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.16it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:02<00:04,  1.06it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.28it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.08it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.16it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.07s/it]Loading checkpoint shards:  38%|███▊      | 3/8 [00:03<00:05,  1.05s/it]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.13it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.21it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:03<00:03,  1.16it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:03<00:02,  1.28it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.12it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.16it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.07s/it]Loading checkpoint shards:  50%|█████     | 4/8 [00:04<00:04,  1.06s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.22it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.15it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:04<00:02,  1.19it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:04<00:01,  1.27it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.13it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.16it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.24it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.16it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.28it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:05<00:01,  1.19it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.07s/it]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:05<00:03,  1.07s/it]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.14it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.56it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.37it/s]
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.16it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:05<00:00,  1.28it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.38it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.17it/s]
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.18it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:06<00:00,  1.20it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.40it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.55it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.31it/s]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.43it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.21it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-14B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 40,
+  "model_type": "qwen3",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] Setting args.use_chat_template: False
+[INFO:swift] Setting args.loss_scale: 'all'
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 2048
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: react_en
+[INFO:swift] Start time of running main: 2025-09-16 00:13:13.066119
+[INFO:swift] swift.__version__: 3.7.3
+Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.07s/it]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:06<00:02,  1.08s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.44it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.23it/s]
+Loading checkpoint shards:  88%|████████▊ | 7/8 [00:07<00:01,  1.16s/it]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:07<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.10it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.02s/it]
+Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.09it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.01s/it]
+[INFO:swift] train_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+[INFO:swift] val_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] [INPUT_IDS] [18493, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, 44, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [INPUT] 在平凡的学习、工作、生活中，大家都写过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开心极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事���生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少的食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一年又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过来，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸爸、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>Maju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, -100, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [LABELS] [-100 * 1]平凡的学习、工作、生活中，大家都���过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开心极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事发生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少的食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一年又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过��，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸爸、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>[-100 * 1]aju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 5120)
+    (layers): ModuleList(
+      (0-39): 40 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
+          (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
+          (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
+          (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((5120,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 14768.3072M Params (14768.3072M Trainable [100.0000%]), 0.0001M Buffers.
+[WARNING:swift] Using IterableDataset, setting args.dataloader_num_workers to 1.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-14B-Base/cpt_10lang_mono/0.5B/logging.jsonl
+W0916 00:13:46.258000 135387944871424 torch/distributed/elastic/agent/server/api.py:688] Received Signals.SIGTERM death signal, shutting down workers
+W0916 00:13:46.261000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449887 closing signal SIGTERM
+W0916 00:13:46.262000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449888 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449889 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449890 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449892 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449893 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449894 closing signal SIGTERM
+W0916 00:13:46.263000 135387944871424 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1449896 closing signal SIGTERM
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
+    result = agent.run()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
+    result = f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
+    result = self._invoke_run(role)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 835, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 79, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 1449808 got signal: 15
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=25
++ per_device_eval_batch_size=25
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/train.log
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
+[2025-09-16 00:14:31,442] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-16 00:14:38,284] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:14:38,335] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:14:38,616] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 00:14:38,867] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:14:38,890] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:14:38,909] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:14:38,911] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 00:14:38,920] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[2025-09-16 00:14:39,996] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:40.847456756 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:14:40,007] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:40.858910454 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-16 00:14:40,120] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:14:40,120] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[W916 00:14:40.968566460 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl'],
+dataset_num_proc=1,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=1000.0,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=3,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=2048,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=5000,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B,
+overwrite_output_dir=False,
+packing=True,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=25,
+per_device_train_batch_size=25,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000.0,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=True,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][2025-09-16 00:14:40,948] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:40.801226283 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:14:41,153] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:41.002710954 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:14:41,184] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:41.031698048 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 00:14:41,206] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 00:14:41,209] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 00:14:41.055225398 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 00:14:41.057237419 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:03,  1.67s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:03,  1.64s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:03,  1.66s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.11s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.02it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.08it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.06it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.02s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.32s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.31s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.32s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.09it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] Setting args.use_chat_template: False
+[INFO:swift] Setting args.loss_scale: 'all'
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 2048
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: react_en
+[INFO:swift] Start time of running main: 2025-09-16 00:14:43.334666
+[INFO:swift] swift.__version__: 3.7.3
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.02s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.18s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.27it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.12s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.14s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.20s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.25it/s]
+[INFO:swift] train_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+[INFO:swift] val_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] [INPUT_IDS] [18493, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, 44, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [INPUT] 在平凡的学习、工作、生活中，大家都写过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开心极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事发生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少���食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一年又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过来，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸爸、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>Maju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, -100, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [LABELS] [-100 * 1]平凡的学习、工作、生活中，大家都写过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开心极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事发生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少的食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一年又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过来，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸��、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>[-100 * 1]aju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 4022.4681M Params (4022.4681M Trainable [100.0000%]), 0.0001M Buffers.
+[WARNING:swift] Using IterableDataset, setting args.dataloader_num_workers to 1.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/logging.jsonl
+Train:   0%|          | 0/5000 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+[INFO:swift] use_logits_to_keep: True
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 1/5000 [00:27<38:47:02, 27.93s/it]                                                          {'loss': 2.10210896, 'token_acc': 0.57490628, 'grad_norm': 0.66473794, 'learning_rate': 8e-08, 'memory(GiB)': 122.92, 'train_speed(iter/s)': 0.0235, 'epoch': 0.0, 'global_step/max_steps': '1/5000', 'percentage': '0.02%', 'elapsed_time': '27s', 'remaining_time': '1d 14h 47m 21s'}
+Train:   0%|          | 1/5000 [00:27<38:47:02, 27.93s/it]Train:   0%|          | 1/5000 [00:27<38:47:02, 27.93s/it]Train:   0%|          | 2/5000 [00:42<27:37:25, 19.90s/it]Train:   0%|          | 3/5000 [00:56<24:03:35, 17.33s/it]Train:   0%|          | 4/5000 [01:10<22:24:41, 16.15s/it]Train:   0%|          | 5/5000 [01:25<21:29:28, 15.49s/it]Train:   0%|          | 6/5000 [01:39<20:56:30, 15.10s/it]Train:   0%|          | 7/5000 [01:53<20:35:24, 14.85s/it]Train:   0%|          | 8/5000 [02:08<20:21:54, 14.69s/it]Train:   0%|          | 9/5000 [02:22<20:11:27, 14.56s/it]Train:   0%|          | 10/5000 [02:36<20:05:30, 14.50s/it]                                                           {'loss': 2.11973826, 'token_acc': 0.57246821, 'grad_norm': 0.64821106, 'learning_rate': 8e-07, 'memory(GiB)': 126.38, 'train_speed(iter/s)': 0.058341, 'epoch': 0.0, 'global_step/max_steps': '10/5000', 'percentage': '0.20%', 'elapsed_time': '2m 36s', 'remaining_time': '21h 43m 56s'}
+Train:   0%|          | 10/5000 [02:36<20:05:30, 14.50s/it]Train:   0%|          | 10/5000 [02:36<20:05:30, 14.50s/it]Train:   0%|          | 11/5000 [02:51<20:00:46, 14.44s/it]Train:   0%|          | 12/5000 [03:05<19:57:41, 14.41s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (133055 > 131072). Running this sequence through the model will result in indexing errors
+Train:   0%|          | 13/5000 [03:19<19:55:03, 14.38s/it]Train:   0%|          | 14/5000 [03:34<19:53:31, 14.36s/it]Train:   0%|          | 15/5000 [03:48<19:51:44, 14.34s/it]Train:   0%|          | 16/5000 [04:02<19:50:49, 14.34s/it]Train:   0%|          | 17/5000 [04:16<19:48:59, 14.32s/it]Train:   0%|          | 18/5000 [04:31<19:48:14, 14.31s/it]Train:   0%|          | 19/5000 [04:45<19:47:33, 14.31s/it]Train:   0%|          | 20/5000 [04:59<19:45:35, 14.28s/it]                                                           {'loss': 2.11791954, 'token_acc': 0.57405664, 'grad_norm': 0.54019088, 'learning_rate': 1.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.063612, 'epoch': 0.0, 'global_step/max_steps': '20/5000', 'percentage': '0.40%', 'elapsed_time': '4m 59s', 'remaining_time': '20h 44m 6s'}
+Train:   0%|          | 20/5000 [04:59<19:45:35, 14.28s/it]Train:   0%|          | 20/5000 [04:59<19:45:35, 14.28s/it]Train:   0%|          | 21/5000 [05:14<19:44:54, 14.28s/it]Train:   0%|          | 22/5000 [05:28<19:43:21, 14.26s/it]Train:   0%|          | 23/5000 [05:42<19:43:26, 14.27s/it]Train:   0%|          | 24/5000 [05:56<19:43:13, 14.27s/it]Train:   0%|          | 25/5000 [06:11<19:43:29, 14.27s/it]Train:   1%|          | 26/5000 [06:25<19:43:44, 14.28s/it]Train:   1%|          | 27/5000 [06:39<19:43:03, 14.27s/it]Train:   1%|          | 28/5000 [06:53<19:43:07, 14.28s/it]Train:   1%|          | 29/5000 [07:08<19:42:32, 14.27s/it]Train:   1%|          | 30/5000 [07:22<19:42:04, 14.27s/it]                                                           {'loss': 2.10259705, 'token_acc': 0.56723745, 'grad_norm': 0.43265003, 'learning_rate': 2.4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.065631, 'epoch': 0.01, 'global_step/max_steps': '30/5000', 'percentage': '0.60%', 'elapsed_time': '7m 22s', 'remaining_time': '20h 21m 43s'}
+Train:   1%|          | 30/5000 [07:22<19:42:04, 14.27s/it]Train:   1%|          | 30/5000 [07:22<19:42:04, 14.27s/it]Train:   1%|          | 31/5000 [07:36<19:42:39, 14.28s/it]Train:   1%|          | 32/5000 [07:51<19:43:15, 14.29s/it]Train:   1%|          | 33/5000 [08:05<19:42:28, 14.28s/it]Train:   1%|          | 34/5000 [08:19<19:41:34, 14.28s/it]Train:   1%|          | 35/5000 [08:33<19:41:12, 14.27s/it]Train:   1%|          | 36/5000 [08:48<19:41:46, 14.28s/it]Train:   1%|          | 37/5000 [09:02<19:41:38, 14.29s/it]Train:   1%|          | 38/5000 [09:16<19:41:11, 14.28s/it]Train:   1%|          | 39/5000 [09:31<19:40:51, 14.28s/it]Train:   1%|          | 40/5000 [09:45<19:41:43, 14.29s/it]                                                           {'loss': 2.1006794, 'token_acc': 0.57251624, 'grad_norm': 0.40444338, 'learning_rate': 3.2e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.066668, 'epoch': 0.01, 'global_step/max_steps': '40/5000', 'percentage': '0.80%', 'elapsed_time': '9m 45s', 'remaining_time': '20h 9m 45s'}
+Train:   1%|          | 40/5000 [09:45<19:41:43, 14.29s/it]Train:   1%|          | 40/5000 [09:45<19:41:43, 14.29s/it]Train:   1%|          | 41/5000 [09:59<19:41:58, 14.30s/it]Train:   1%|          | 42/5000 [10:14<19:42:17, 14.31s/it]Train:   1%|          | 43/5000 [10:28<19:42:13, 14.31s/it]Train:   1%|          | 44/5000 [10:42<19:41:04, 14.30s/it]Train:   1%|          | 45/5000 [10:56<19:40:06, 14.29s/it]Train:   1%|          | 46/5000 [11:11<19:39:06, 14.28s/it]Train:   1%|          | 47/5000 [11:25<19:38:46, 14.28s/it]Train:   1%|          | 48/5000 [11:39<19:38:12, 14.28s/it]Train:   1%|          | 49/5000 [11:53<19:37:36, 14.27s/it]Train:   1%|          | 50/5000 [12:08<19:37:19, 14.27s/it]                                                           {'loss': 2.06914444, 'token_acc': 0.56583606, 'grad_norm': 0.36004195, 'learning_rate': 4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.067311, 'epoch': 0.01, 'global_step/max_steps': '50/5000', 'percentage': '1.00%', 'elapsed_time': '12m 8s', 'remaining_time': '20h 1m 31s'}
+Train:   1%|          | 50/5000 [12:08<19:37:19, 14.27s/it]Train:   1%|          | 50/5000 [12:08<19:37:19, 14.27s/it]Train:   1%|          | 51/5000 [12:22<19:37:45, 14.28s/it]Train:   1%|          | 52/5000 [12:36<19:36:59, 14.27s/it]Train:   1%|          | 53/5000 [12:51<19:36:52, 14.27s/it]Train:   1%|          | 54/5000 [13:05<19:36:15, 14.27s/it]Train:   1%|          | 55/5000 [13:19<19:34:46, 14.25s/it]Train:   1%|          | 56/5000 [13:33<19:35:12, 14.26s/it]Train:   1%|          | 57/5000 [13:48<19:35:03, 14.26s/it]Train:   1%|          | 58/5000 [14:02<19:35:18, 14.27s/it]Train:   1%|          | 59/5000 [14:16<19:34:50, 14.27s/it]Train:   1%|          | 60/5000 [14:30<19:37:01, 14.30s/it]                                                           {'loss': 2.06273766, 'token_acc': 0.57397553, 'grad_norm': 0.38312691, 'learning_rate': 4.8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.067752, 'epoch': 0.01, 'global_step/max_steps': '60/5000', 'percentage': '1.20%', 'elapsed_time': '14m 30s', 'remaining_time': '19h 55m 9s'}
+Train:   1%|          | 60/5000 [14:30<19:37:01, 14.30s/it]Train:   1%|          | 60/5000 [14:30<19:37:01, 14.30s/it]Train:   1%|          | 61/5000 [14:45<19:36:22, 14.29s/it]Train:   1%|          | 62/5000 [14:59<19:35:10, 14.28s/it]Train:   1%|▏         | 63/5000 [15:13<19:34:29, 14.27s/it]Train:   1%|▏         | 64/5000 [15:28<19:34:55, 14.28s/it]Train:   1%|▏         | 65/5000 [15:42<19:34:50, 14.28s/it]Train:   1%|▏         | 66/5000 [15:56<19:34:45, 14.29s/it]Train:   1%|▏         | 67/5000 [16:10<19:34:13, 14.28s/it]Train:   1%|▏         | 68/5000 [16:25<19:35:46, 14.30s/it]Train:   1%|▏         | 69/5000 [16:39<19:34:57, 14.30s/it]Train:   1%|▏         | 70/5000 [16:53<19:34:54, 14.30s/it]                                                           {'loss': 2.04535313, 'token_acc': 0.57694867, 'grad_norm': 0.34575719, 'learning_rate': 5.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068063, 'epoch': 0.01, 'global_step/max_steps': '70/5000', 'percentage': '1.40%', 'elapsed_time': '16m 53s', 'remaining_time': '19h 50m 3s'}
+Train:   1%|▏         | 70/5000 [16:53<19:34:54, 14.30s/it]Train:   1%|▏         | 70/5000 [16:53<19:34:54, 14.30s/it]Train:   1%|▏         | 71/5000 [17:08<19:33:43, 14.29s/it]Train:   1%|▏         | 72/5000 [17:22<19:34:21, 14.30s/it]Train:   1%|▏         | 73/5000 [17:36<19:33:34, 14.29s/it]Train:   1%|▏         | 74/5000 [17:51<19:34:02, 14.30s/it]Train:   2%|▏         | 75/5000 [18:05<19:33:44, 14.30s/it]Train:   2%|▏         | 76/5000 [18:19<19:32:32, 14.29s/it]Train:   2%|▏         | 77/5000 [18:33<19:31:53, 14.28s/it]Train:   2%|▏         | 78/5000 [18:48<19:32:04, 14.29s/it]Train:   2%|▏         | 79/5000 [19:02<19:31:30, 14.28s/it]Train:   2%|▏         | 80/5000 [19:16<19:30:34, 14.28s/it]                                                           {'loss': 2.03608284, 'token_acc': 0.57698866, 'grad_norm': 0.34838921, 'learning_rate': 6.4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.0683, 'epoch': 0.02, 'global_step/max_steps': '80/5000', 'percentage': '1.60%', 'elapsed_time': '19m 16s', 'remaining_time': '19h 45m 36s'}
+Train:   2%|▏         | 80/5000 [19:16<19:30:34, 14.28s/it]Train:   2%|▏         | 80/5000 [19:16<19:30:34, 14.28s/it]Train:   2%|▏         | 81/5000 [19:30<19:29:46, 14.27s/it]Train:   2%|▏         | 82/5000 [19:45<19:29:42, 14.27s/it]Train:   2%|▏         | 83/5000 [19:59<19:30:28, 14.28s/it]Train:   2%|▏         | 84/5000 [20:13<19:29:38, 14.28s/it]Train:   2%|▏         | 85/5000 [20:28<19:28:47, 14.27s/it]Train:   2%|▏         | 86/5000 [20:42<19:27:48, 14.26s/it]Train:   2%|▏         | 87/5000 [20:56<19:27:53, 14.26s/it]Train:   2%|▏         | 88/5000 [21:10<19:26:40, 14.25s/it]Train:   2%|▏         | 89/5000 [21:25<19:27:10, 14.26s/it]Train:   2%|▏         | 90/5000 [21:39<19:26:54, 14.26s/it]                                                           {'loss': 2.01797523, 'token_acc': 0.57667686, 'grad_norm': 0.3384656, 'learning_rate': 7.2e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068497, 'epoch': 0.02, 'global_step/max_steps': '90/5000', 'percentage': '1.80%', 'elapsed_time': '21m 39s', 'remaining_time': '19h 41m 24s'}
+Train:   2%|▏         | 90/5000 [21:39<19:26:54, 14.26s/it]Train:   2%|▏         | 90/5000 [21:39<19:26:54, 14.26s/it]Train:   2%|▏         | 91/5000 [21:53<19:27:01, 14.26s/it]Train:   2%|▏         | 92/5000 [22:07<19:26:24, 14.26s/it]Train:   2%|▏         | 93/5000 [22:22<19:26:47, 14.27s/it]Train:   2%|▏         | 94/5000 [22:36<19:25:56, 14.26s/it]Train:   2%|▏         | 95/5000 [22:50<19:24:48, 14.25s/it]Train:   2%|▏         | 96/5000 [23:04<19:24:06, 14.24s/it]Train:   2%|▏         | 97/5000 [23:19<19:23:57, 14.24s/it]Train:   2%|▏         | 98/5000 [23:33<19:22:57, 14.23s/it]Train:   2%|▏         | 99/5000 [23:47<19:23:00, 14.24s/it]Train:   2%|▏         | 100/5000 [24:01<19:23:41, 14.25s/it]                                                            {'loss': 2.0218811, 'token_acc': 0.57699347, 'grad_norm': 0.34561437, 'learning_rate': 8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068662, 'epoch': 0.02, 'global_step/max_steps': '100/5000', 'percentage': '2.00%', 'elapsed_time': '24m 1s', 'remaining_time': '19h 37m 27s'}
+Train:   2%|▏         | 100/5000 [24:01<19:23:41, 14.25s/it]Train:   2%|▏         | 100/5000 [24:01<19:23:41, 14.25s/it]Train:   2%|▏         | 101/5000 [24:15<19:22:38, 14.24s/it]Train:   2%|▏         | 102/5000 [24:30<19:23:19, 14.25s/it]Train:   2%|▏         | 103/5000 [24:44<19:23:22, 14.25s/it]Train:   2%|▏         | 104/5000 [24:58<19:23:20, 14.26s/it]Train:   2%|▏         | 105/5000 [25:13<19:22:42, 14.25s/it]Train:   2%|▏         | 106/5000 [25:27<19:22:13, 14.25s/it]Train:   2%|▏         | 107/5000 [25:41<19:22:10, 14.25s/it]Train:   2%|▏         | 108/5000 [25:55<19:22:50, 14.26s/it]Train:   2%|▏         | 109/5000 [26:10<19:21:50, 14.25s/it]Train:   2%|▏         | 110/5000 [26:24<19:21:21, 14.25s/it]                                                            {'loss': 1.98817062, 'token_acc': 0.58024266, 'grad_norm': 0.35130295, 'learning_rate': 8.8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068796, 'epoch': 0.02, 'global_step/max_steps': '110/5000', 'percentage': '2.20%', 'elapsed_time': '26m 24s', 'remaining_time': '19h 33m 49s'}
+Train:   2%|▏         | 110/5000 [26:24<19:21:21, 14.25s/it]Train:   2%|▏         | 110/5000 [26:24<19:21:21, 14.25s/it]Train:   2%|▏         | 111/5000 [26:38<19:22:21, 14.26s/it]Train:   2%|▏         | 112/5000 [26:52<19:22:32, 14.27s/it]Train:   2%|▏         | 113/5000 [27:07<19:22:23, 14.27s/it]Train:   2%|▏         | 114/5000 [27:21<19:22:17, 14.27s/it]Train:   2%|▏         | 115/5000 [27:35<19:22:49, 14.28s/it]Train:   2%|▏         | 116/5000 [27:50<19:22:37, 14.28s/it]Train:   2%|▏         | 117/5000 [28:04<19:21:33, 14.27s/it]Train:   2%|▏         | 118/5000 [28:18<19:21:59, 14.28s/it]Train:   2%|▏         | 119/5000 [28:32<19:22:54, 14.30s/it]Train:   2%|▏         | 120/5000 [28:47<19:22:07, 14.29s/it]                                                            {'loss': 1.97514191, 'token_acc': 0.57968566, 'grad_norm': 0.35325646, 'learning_rate': 9.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068894, 'epoch': 0.02, 'global_step/max_steps': '120/5000', 'percentage': '2.40%', 'elapsed_time': '28m 47s', 'remaining_time': '19h 30m 38s'}
+Train:   2%|▏         | 120/5000 [28:47<19:22:07, 14.29s/it]Train:   2%|▏         | 120/5000 [28:47<19:22:07, 14.29s/it]Train:   2%|▏         | 121/5000 [29:01<19:21:52, 14.29s/it]Train:   2%|▏         | 122/5000 [29:15<19:22:24, 14.30s/it]Train:   2%|▏         | 123/5000 [29:30<19:22:17, 14.30s/it]Train:   2%|▏         | 124/5000 [29:44<19:21:26, 14.29s/it]Train:   2%|▎         | 125/5000 [29:58<19:19:53, 14.28s/it]Train:   3%|▎         | 126/5000 [30:12<19:19:30, 14.27s/it]Train:   3%|▎         | 127/5000 [30:27<19:19:15, 14.27s/it]Train:   3%|▎         | 128/5000 [30:41<19:19:56, 14.28s/it]Train:   3%|▎         | 129/5000 [30:55<19:18:23, 14.27s/it]Train:   3%|▎         | 130/5000 [31:09<19:18:50, 14.28s/it]                                                            {'loss': 1.98410797, 'token_acc': 0.58540449, 'grad_norm': 0.38012269, 'learning_rate': 1.04e-05, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.06898, 'epoch': 0.03, 'global_step/max_steps': '130/5000', 'percentage': '2.60%', 'elapsed_time': '31m 9s', 'remaining_time': '19h 27m 32s'}
+Train:   3%|▎         | 130/5000 [31:09<19:18:50, 14.28s/it]Train:   3%|▎         | 130/5000 [31:09<19:18:50, 14.28s/it]Train:   3%|▎         | 131/5000 [31:24<19:18:13, 14.27s/it]Train:   3%|▎         | 132/5000 [31:38<19:18:12, 14.28s/it]Train:   3%|▎         | 133/5000 [31:52<19:17:53, 14.27s/it]Train:   3%|▎         | 134/5000 [32:07<19:16:37, 14.26s/it]Train:   3%|▎         | 135/5000 [32:21<19:16:49, 14.27s/it]Train:   3%|▎         | 136/5000 [32:35<19:16:58, 14.27s/it]Train:   3%|▎         | 137/5000 [32:49<19:16:33, 14.27s/it]Train:   3%|▎         | 138/5000 [33:04<19:16:03, 14.27s/it]Train:   3%|▎         | 139/5000 [33:18<19:15:19, 14.26s/it]Train:   3%|▎         | 140/5000 [33:32<19:16:03, 14.27s/it]                                                            {'loss': 1.96440849, 'token_acc': 0.58755211, 'grad_norm': 0.41294757, 'learning_rate': 1.12e-05, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.069058, 'epoch': 0.03, 'global_step/max_steps': '140/5000', 'percentage': '2.80%', 'elapsed_time': '33m 32s', 'remaining_time': '19h 24m 28s'}
+Train:   3%|▎         | 140/5000 [33:32<19:16:03, 14.27s/it]Train:   3%|▎         | 140/5000 [33:32<19:16:03, 14.27s/it]Train:   3%|▎         | 141/5000 [33:46<19:15:29, 14.27s/it]Train:   3%|▎         | 142/5000 [34:01<19:14:47, 14.26s/it]Train:   3%|▎         | 143/5000 [34:15<19:14:13, 14.26s/it]Train:   3%|▎         | 144/5000 [34:29<19:14:04, 14.26s/it]Train:   3%|▎         | 145/5000 [34:43<19:14:25, 14.27s/it]Train:   3%|▎         | 146/5000 [34:58<19:14:32, 14.27s/it]Train:   3%|▎         | 147/5000 [35:12<19:14:18, 14.27s/it]Train:   3%|▎         | 148/5000 [35:26<19:13:28, 14.26s/it]Train:   3%|▎         | 149/5000 [35:41<19:14:13, 14.28s/it]Train:   3%|▎         | 150/5000 [35:55<19:13:30, 14.27s/it]                                                            {'loss': 1.94323387, 'token_acc': 0.58965816, 'grad_norm': 0.42667764, 'learning_rate': 1.2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069126, 'epoch': 0.03, 'global_step/max_steps': '150/5000', 'percentage': '3.00%', 'elapsed_time': '35m 55s', 'remaining_time': '19h 21m 28s'}
+Train:   3%|▎         | 150/5000 [35:55<19:13:30, 14.27s/it]Train:   3%|▎         | 150/5000 [35:55<19:13:30, 14.27s/it]Train:   3%|▎         | 151/5000 [36:09<19:13:52, 14.28s/it]Train:   3%|▎         | 152/5000 [36:23<19:12:36, 14.27s/it]Train:   3%|▎         | 153/5000 [36:38<19:12:32, 14.27s/it]Train:   3%|▎         | 154/5000 [36:52<19:12:49, 14.27s/it]Train:   3%|▎         | 155/5000 [37:06<19:14:32, 14.30s/it]Train:   3%|▎         | 156/5000 [37:21<19:14:10, 14.30s/it]Train:   3%|▎         | 157/5000 [37:35<19:13:56, 14.30s/it]Train:   3%|▎         | 158/5000 [37:49<19:13:07, 14.29s/it]Train:   3%|▎         | 159/5000 [38:03<19:12:45, 14.29s/it]Train:   3%|▎         | 160/5000 [38:18<19:12:01, 14.28s/it]                                                            {'loss': 1.95848866, 'token_acc': 0.58948284, 'grad_norm': 0.4178963, 'learning_rate': 1.28e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.06918, 'epoch': 0.03, 'global_step/max_steps': '160/5000', 'percentage': '3.20%', 'elapsed_time': '38m 18s', 'remaining_time': '19h 18m 40s'}
+Train:   3%|▎         | 160/5000 [38:18<19:12:01, 14.28s/it]Train:   3%|▎         | 160/5000 [38:18<19:12:01, 14.28s/it]Train:   3%|▎         | 161/5000 [38:32<19:12:26, 14.29s/it]Train:   3%|▎         | 162/5000 [38:46<19:12:27, 14.29s/it]Train:   3%|▎         | 163/5000 [39:01<19:11:51, 14.29s/it]Train:   3%|▎         | 164/5000 [39:15<19:10:25, 14.27s/it]Train:   3%|▎         | 165/5000 [39:29<19:09:24, 14.26s/it]Train:   3%|▎         | 166/5000 [39:43<19:08:42, 14.26s/it]Train:   3%|▎         | 167/5000 [39:58<19:08:17, 14.26s/it]Train:   3%|▎         | 168/5000 [40:12<19:09:13, 14.27s/it]Train:   3%|▎         | 169/5000 [40:26<19:08:51, 14.27s/it]Train:   3%|▎         | 170/5000 [40:40<19:09:20, 14.28s/it]                                                            {'loss': 1.94442997, 'token_acc': 0.58962605, 'grad_norm': 0.36368716, 'learning_rate': 1.36e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069231, 'epoch': 0.03, 'global_step/max_steps': '170/5000', 'percentage': '3.40%', 'elapsed_time': '40m 40s', 'remaining_time': '19h 15m 50s'}
+Train:   3%|▎         | 170/5000 [40:40<19:09:20, 14.28s/it]Train:   3%|▎         | 170/5000 [40:40<19:09:20, 14.28s/it]Train:   3%|▎         | 171/5000 [40:55<19:10:03, 14.29s/it]Train:   3%|▎         | 172/5000 [41:09<19:09:56, 14.29s/it]Train:   3%|▎         | 173/5000 [41:23<19:09:43, 14.29s/it]Train:   3%|▎         | 174/5000 [41:38<19:10:32, 14.30s/it]Train:   4%|▎         | 175/5000 [41:52<19:09:58, 14.30s/it]Train:   4%|▎         | 176/5000 [42:06<19:10:21, 14.31s/it]Train:   4%|▎         | 177/5000 [42:21<19:10:03, 14.31s/it]Train:   4%|▎         | 178/5000 [42:35<19:09:17, 14.30s/it]Train:   4%|▎         | 179/5000 [42:49<19:08:24, 14.29s/it]Train:   4%|▎         | 180/5000 [43:03<19:07:42, 14.29s/it]                                                            {'loss': 1.94179916, 'token_acc': 0.58966345, 'grad_norm': 0.35477984, 'learning_rate': 1.44e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.06927, 'epoch': 0.04, 'global_step/max_steps': '180/5000', 'percentage': '3.60%', 'elapsed_time': '43m 3s', 'remaining_time': '19h 13m 11s'}
+Train:   4%|▎         | 180/5000 [43:03<19:07:42, 14.29s/it]Train:   4%|▎         | 180/5000 [43:03<19:07:42, 14.29s/it]Train:   4%|▎         | 181/5000 [43:18<19:06:43, 14.28s/it]Train:   4%|▎         | 182/5000 [43:32<19:06:48, 14.28s/it]Train:   4%|▎         | 183/5000 [43:46<19:07:00, 14.29s/it]Train:   4%|▎         | 184/5000 [44:01<19:06:50, 14.29s/it]Train:   4%|▎         | 185/5000 [44:15<19:06:33, 14.29s/it]Train:   4%|▎         | 186/5000 [44:29<19:06:54, 14.29s/it]Train:   4%|▎         | 187/5000 [44:43<19:06:23, 14.29s/it]Train:   4%|▍         | 188/5000 [44:58<19:05:48, 14.29s/it]Train:   4%|▍         | 189/5000 [45:12<19:04:54, 14.28s/it]Train:   4%|▍         | 190/5000 [45:26<19:04:44, 14.28s/it]                                                            {'loss': 1.92637177, 'token_acc': 0.59283101, 'grad_norm': 0.31059563, 'learning_rate': 1.52e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069309, 'epoch': 0.04, 'global_step/max_steps': '190/5000', 'percentage': '3.80%', 'elapsed_time': '45m 26s', 'remaining_time': '19h 10m 29s'}
+Train:   4%|▍         | 190/5000 [45:26<19:04:44, 14.28s/it]Train:   4%|▍         | 190/5000 [45:26<19:04:44, 14.28s/it]Train:   4%|▍         | 191/5000 [45:41<19:04:26, 14.28s/it]Train:   4%|▍         | 192/5000 [45:55<19:05:00, 14.29s/it]Train:   4%|▍         | 193/5000 [46:09<19:04:05, 14.28s/it]Train:   4%|▍         | 194/5000 [46:23<19:03:26, 14.28s/it]Train:   4%|▍         | 195/5000 [46:38<19:05:17, 14.30s/it]Train:   4%|▍         | 196/5000 [46:52<19:03:36, 14.28s/it]Train:   4%|▍         | 197/5000 [47:06<19:03:58, 14.29s/it]Train:   4%|▍         | 198/5000 [47:21<19:04:15, 14.30s/it]Train:   4%|▍         | 199/5000 [47:35<19:03:41, 14.29s/it]Train:   4%|▍         | 200/5000 [47:49<19:03:16, 14.29s/it]                                                            {'loss': 1.92326355, 'token_acc': 0.58659282, 'grad_norm': 0.29605806, 'learning_rate': 1.6e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069342, 'epoch': 0.04, 'global_step/max_steps': '200/5000', 'percentage': '4.00%', 'elapsed_time': '47m 49s', 'remaining_time': '19h 7m 51s'}
+Train:   4%|▍         | 200/5000 [47:49<19:03:16, 14.29s/it]Train:   4%|▍         | 200/5000 [47:49<19:03:16, 14.29s/it]Train:   4%|▍         | 201/5000 [48:03<19:02:40, 14.29s/it]Train:   4%|▍         | 202/5000 [48:18<19:02:56, 14.29s/it]Train:   4%|▍         | 203/5000 [48:32<19:03:17, 14.30s/it]Train:   4%|▍         | 204/5000 [48:46<19:02:54, 14.30s/it]Train:   4%|▍         | 205/5000 [49:01<19:02:35, 14.30s/it]Train:   4%|▍         | 206/5000 [49:15<19:02:15, 14.30s/it]Train:   4%|▍         | 207/5000 [49:29<19:02:25, 14.30s/it]Train:   4%|▍         | 208/5000 [49:44<19:01:14, 14.29s/it]Train:   4%|▍         | 209/5000 [49:58<19:00:37, 14.28s/it]Train:   4%|▍         | 210/5000 [50:12<18:59:50, 14.28s/it]                                                            {'loss': 1.90565643, 'token_acc': 0.59130438, 'grad_norm': 0.30628449, 'learning_rate': 1.68e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069372, 'epoch': 0.04, 'global_step/max_steps': '210/5000', 'percentage': '4.20%', 'elapsed_time': '50m 12s', 'remaining_time': '19h 5m 14s'}
+Train:   4%|▍         | 210/5000 [50:12<18:59:50, 14.28s/it]Train:   4%|▍         | 210/5000 [50:12<18:59:50, 14.28s/it]Train:   4%|▍         | 211/5000 [50:26<18:58:46, 14.27s/it]Train:   4%|▍         | 212/5000 [50:41<18:59:54, 14.28s/it]Train:   4%|▍         | 213/5000 [50:55<18:59:18, 14.28s/it]Train:   4%|▍         | 214/5000 [51:09<18:58:43, 14.28s/it]Train:   4%|▍         | 215/5000 [51:23<18:58:15, 14.27s/it]Train:   4%|▍         | 216/5000 [51:38<18:57:58, 14.27s/it]Train:   4%|▍         | 217/5000 [51:52<18:56:52, 14.26s/it]Train:   4%|▍         | 218/5000 [52:06<18:57:06, 14.27s/it]Train:   4%|▍         | 219/5000 [52:20<18:56:23, 14.26s/it]Train:   4%|▍         | 220/5000 [52:35<18:57:31, 14.28s/it]                                                            {'loss': 1.90498257, 'token_acc': 0.59482406, 'grad_norm': 0.3018851, 'learning_rate': 1.76e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069403, 'epoch': 0.04, 'global_step/max_steps': '220/5000', 'percentage': '4.40%', 'elapsed_time': '52m 35s', 'remaining_time': '19h 2m 35s'}
+Train:   4%|▍         | 220/5000 [52:35<18:57:31, 14.28s/it]Train:   4%|▍         | 220/5000 [52:35<18:57:31, 14.28s/it]Train:   4%|▍         | 221/5000 [52:49<18:56:23, 14.27s/it]Train:   4%|▍         | 222/5000 [53:03<18:55:28, 14.26s/it]Train:   4%|▍         | 223/5000 [53:18<18:55:37, 14.26s/it]Train:   4%|▍         | 224/5000 [53:32<18:54:24, 14.25s/it]Train:   4%|▍         | 225/5000 [53:46<18:55:22, 14.27s/it]Train:   5%|▍         | 226/5000 [54:00<18:56:06, 14.28s/it]Train:   5%|▍         | 227/5000 [54:15<18:55:37, 14.28s/it]Train:   5%|▍         | 228/5000 [54:29<18:54:46, 14.27s/it]Train:   5%|▍         | 229/5000 [54:43<18:54:46, 14.27s/it]Train:   5%|▍         | 230/5000 [54:57<18:53:44, 14.26s/it]                                                            {'loss': 1.90374413, 'token_acc': 0.59539596, 'grad_norm': 0.3228516, 'learning_rate': 1.84e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069434, 'epoch': 0.05, 'global_step/max_steps': '230/5000', 'percentage': '4.60%', 'elapsed_time': '54m 57s', 'remaining_time': '18h 59m 55s'}
+Train:   5%|▍         | 230/5000 [54:57<18:53:44, 14.26s/it]Train:   5%|▍         | 230/5000 [54:57<18:53:44, 14.26s/it]Train:   5%|▍         | 231/5000 [55:12<18:54:08, 14.27s/it]Train:   5%|▍         | 232/5000 [55:26<18:53:29, 14.26s/it]Train:   5%|▍         | 233/5000 [55:40<18:54:07, 14.27s/it]Train:   5%|▍         | 234/5000 [55:54<18:53:13, 14.27s/it]Train:   5%|▍         | 235/5000 [56:09<18:52:39, 14.26s/it]Train:   5%|▍         | 236/5000 [56:23<18:53:08, 14.27s/it]Train:   5%|▍         | 237/5000 [56:37<18:52:49, 14.27s/it]Train:   5%|▍         | 238/5000 [56:52<18:51:34, 14.26s/it]Train:   5%|▍         | 239/5000 [57:06<18:51:45, 14.26s/it]Train:   5%|▍         | 240/5000 [57:20<18:51:03, 14.26s/it]                                                            {'loss': 1.89821053, 'token_acc': 0.59505033, 'grad_norm': 0.3092941, 'learning_rate': 1.92e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069462, 'epoch': 0.05, 'global_step/max_steps': '240/5000', 'percentage': '4.80%', 'elapsed_time': '57m 20s', 'remaining_time': '18h 57m 17s'}
+Train:   5%|▍         | 240/5000 [57:20<18:51:03, 14.26s/it]Train:   5%|▍         | 240/5000 [57:20<18:51:03, 14.26s/it]Train:   5%|▍         | 241/5000 [57:34<18:51:02, 14.26s/it]Train:   5%|▍         | 242/5000 [57:49<18:51:36, 14.27s/it]Train:   5%|▍         | 243/5000 [58:03<18:51:18, 14.27s/it]Train:   5%|▍         | 244/5000 [58:17<18:51:29, 14.27s/it]Train:   5%|▍         | 245/5000 [58:31<18:51:39, 14.28s/it]Train:   5%|▍         | 246/5000 [58:46<18:51:34, 14.28s/it]Train:   5%|▍         | 247/5000 [59:00<18:51:23, 14.28s/it]Train:   5%|▍         | 248/5000 [59:14<18:50:04, 14.27s/it]Train:   5%|▍         | 249/5000 [59:28<18:49:07, 14.26s/it]Train:   5%|▌         | 250/5000 [59:43<18:48:46, 14.26s/it]                                                            {'loss': 1.89023781, 'token_acc': 0.59163101, 'grad_norm': 0.34664196, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069486, 'epoch': 0.05, 'global_step/max_steps': '250/5000', 'percentage': '5.00%', 'elapsed_time': '59m 43s', 'remaining_time': '18h 54m 41s'}
+Train:   5%|▌         | 250/5000 [59:43<18:48:46, 14.26s/it]Train:   5%|▌         | 250/5000 [59:43<18:48:46, 14.26s/it]Train:   5%|▌         | 251/5000 [59:57<18:49:14, 14.27s/it]Train:   5%|▌         | 252/5000 [1:00:11<18:49:24, 14.27s/it]Train:   5%|▌         | 253/5000 [1:00:26<18:49:07, 14.27s/it]Train:   5%|▌         | 254/5000 [1:00:40<18:49:25, 14.28s/it]Train:   5%|▌         | 255/5000 [1:00:54<18:48:45, 14.27s/it]Train:   5%|▌         | 256/5000 [1:01:08<18:48:47, 14.28s/it]Train:   5%|▌         | 257/5000 [1:01:23<18:48:48, 14.28s/it]Train:   5%|▌         | 258/5000 [1:01:37<18:48:42, 14.28s/it]Train:   5%|▌         | 259/5000 [1:01:51<18:48:07, 14.28s/it]Train:   5%|▌         | 260/5000 [1:02:06<18:47:25, 14.27s/it]                                                              {'loss': 1.88667011, 'token_acc': 0.59444467, 'grad_norm': 0.30647528, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069507, 'epoch': 0.05, 'global_step/max_steps': '260/5000', 'percentage': '5.20%', 'elapsed_time': '1h 2m 6s', 'remaining_time': '18h 52m 8s'}
+Train:   5%|▌         | 260/5000 [1:02:06<18:47:25, 14.27s/it]Train:   5%|▌         | 260/5000 [1:02:06<18:47:25, 14.27s/it]Train:   5%|▌         | 261/5000 [1:02:20<18:47:45, 14.28s/it]Train:   5%|▌         | 262/5000 [1:02:34<18:47:41, 14.28s/it]Train:   5%|▌         | 263/5000 [1:02:48<18:47:10, 14.28s/it]Train:   5%|▌         | 264/5000 [1:03:03<18:47:18, 14.28s/it]Train:   5%|▌         | 265/5000 [1:03:17<18:47:10, 14.28s/it]Train:   5%|▌         | 266/5000 [1:03:31<18:47:03, 14.28s/it]Train:   5%|▌         | 267/5000 [1:03:46<18:47:57, 14.30s/it]Train:   5%|▌         | 268/5000 [1:04:00<18:47:52, 14.30s/it]Train:   5%|▌         | 269/5000 [1:04:14<18:47:18, 14.30s/it]Train:   5%|▌         | 270/5000 [1:04:28<18:45:58, 14.28s/it]                                                              {'loss': 1.88848038, 'token_acc': 0.59465268, 'grad_norm': 0.299853, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069524, 'epoch': 0.05, 'global_step/max_steps': '270/5000', 'percentage': '5.40%', 'elapsed_time': '1h 4m 28s', 'remaining_time': '18h 49m 37s'}
+Train:   5%|▌         | 270/5000 [1:04:28<18:45:58, 14.28s/it]Train:   5%|▌         | 270/5000 [1:04:28<18:45:58, 14.28s/it]Train:   5%|▌         | 271/5000 [1:04:43<18:45:34, 14.28s/it]Train:   5%|▌         | 272/5000 [1:04:57<18:45:01, 14.28s/it]Train:   5%|▌         | 273/5000 [1:05:11<18:44:29, 14.27s/it]Train:   5%|▌         | 274/5000 [1:05:25<18:44:18, 14.27s/it]Train:   6%|▌         | 275/5000 [1:05:40<18:43:14, 14.26s/it]Train:   6%|▌         | 276/5000 [1:05:54<18:45:22, 14.29s/it]Train:   6%|▌         | 277/5000 [1:06:08<18:45:24, 14.30s/it]Train:   6%|▌         | 278/5000 [1:06:23<18:45:45, 14.30s/it]Train:   6%|▌         | 279/5000 [1:06:37<18:45:12, 14.30s/it]Train:   6%|▌         | 280/5000 [1:06:51<18:44:33, 14.30s/it]                                                              {'loss': 1.88816566, 'token_acc': 0.59985999, 'grad_norm': 0.3256793, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069541, 'epoch': 0.06, 'global_step/max_steps': '280/5000', 'percentage': '5.60%', 'elapsed_time': '1h 6m 51s', 'remaining_time': '18h 47m 7s'}
+Train:   6%|▌         | 280/5000 [1:06:51<18:44:33, 14.30s/it]Train:   6%|▌         | 280/5000 [1:06:51<18:44:33, 14.30s/it]Train:   6%|▌         | 281/5000 [1:07:06<18:44:14, 14.29s/it]Train:   6%|▌         | 282/5000 [1:07:20<18:43:30, 14.29s/it]Train:   6%|▌         | 283/5000 [1:07:34<18:43:29, 14.29s/it]Train:   6%|▌         | 284/5000 [1:07:48<18:42:57, 14.29s/it]Train:   6%|▌         | 285/5000 [1:08:03<18:41:59, 14.28s/it]Train:   6%|▌         | 286/5000 [1:08:17<18:41:50, 14.28s/it]Train:   6%|▌         | 287/5000 [1:08:31<18:42:23, 14.29s/it]Train:   6%|▌         | 288/5000 [1:08:46<18:41:11, 14.28s/it]Train:   6%|▌         | 289/5000 [1:09:00<18:41:23, 14.28s/it]Train:   6%|▌         | 290/5000 [1:09:14<18:41:25, 14.29s/it]                                                              {'loss': 1.89417572, 'token_acc': 0.60066996, 'grad_norm': 0.3216432, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069557, 'epoch': 0.06, 'global_step/max_steps': '290/5000', 'percentage': '5.80%', 'elapsed_time': '1h 9m 14s', 'remaining_time': '18h 44m 36s'}
+Train:   6%|▌         | 290/5000 [1:09:14<18:41:25, 14.29s/it]Train:   6%|▌         | 290/5000 [1:09:14<18:41:25, 14.29s/it]Train:   6%|▌         | 291/5000 [1:09:28<18:41:19, 14.29s/it]Train:   6%|▌         | 292/5000 [1:09:43<18:41:05, 14.29s/it]W0916 01:24:58.014000 136505867785728 torch/distributed/elastic/agent/server/api.py:688] Received Signals.SIGTERM death signal, shutting down workers
+W0916 01:24:58.019000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451300 closing signal SIGTERM
+W0916 01:24:58.019000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451301 closing signal SIGTERM
+W0916 01:24:58.019000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451302 closing signal SIGTERM
+W0916 01:24:58.019000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451303 closing signal SIGTERM
+W0916 01:24:58.020000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451304 closing signal SIGTERM
+W0916 01:24:58.020000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451305 closing signal SIGTERM
+W0916 01:24:58.020000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451306 closing signal SIGTERM
+W0916 01:24:58.020000 136505867785728 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1451307 closing signal SIGTERM
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
+    result = agent.run()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
+    result = f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
+    result = self._invoke_run(role)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 835, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 79, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 1451222 got signal: 15
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=24
++ per_device_eval_batch_size=24
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/train.log
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 24 --per_device_eval_batch_size 24 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
+[2025-09-16 01:27:38,384] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 24 --per_device_eval_batch_size 24 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+W0916 01:27:45.371000 132049702192640 torch/distributed/elastic/agent/server/api.py:688] Received Signals.SIGTERM death signal, shutting down workers
+W0916 01:27:45.371000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499252 closing signal SIGTERM
+W0916 01:27:45.371000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499253 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499254 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499255 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499256 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499257 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499258 closing signal SIGTERM
+W0916 01:27:45.372000 132049702192640 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1499259 closing signal SIGTERM
+Traceback (most recent call last):
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+    main()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
+    return f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+    run(args)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+    elastic_launch(
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
+    result = agent.run()
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
+    result = f(*args, **kwargs)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
+    result = self._invoke_run(role)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 835, in _invoke_run
+    time.sleep(monitor_interval)
+  File "/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 79, in _terminate_process_handler
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+torch.distributed.elastic.multiprocessing.api.SignalException: Process 1499173 got signal: 15
+++++ readlink -f cpt_mt_4b.sh
++++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr/cpt_mt_4b.sh
+++ dirname /mnt/nvme1/luoyingfeng/llm-mt/scripts_arr
++ ROOT_DIR=/mnt/nvme1/luoyingfeng/llm-mt
++ export HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ HF_HOME=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ MODELSCOPE_CACHE=/mnt/nvme1/luoyingfeng/llm-mt/cache/
++ export HF_EVALUATE_OFFLINE=1
++ HF_EVALUATE_OFFLINE=1
++ export HF_DATASETS_OFFLINE=1
++ HF_DATASETS_OFFLINE=1
++ export NPROC_PER_NODE=8
++ NPROC_PER_NODE=8
++ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
++ model_name=Qwen3-4B-Base
++ model_dir=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
++ config_file=/mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json
++ train_dataset=($ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl)
++ val_dataset=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
++ per_device_train_batch_size=25
++ per_device_eval_batch_size=25
++ gradient_accumulation_steps=3
++ max_lengths=2048
++ max_steps=5000
++ task=cpt_10lang_mono
++ tag=0.5B
++ output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ mkdir -p /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ cp cpt_mt_4b.sh /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B
++ swift pt --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000
++ tee /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/train.log
+[2025-09-16 01:28:52,501] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+run sh: `/mnt/nvme1/luoyingfeng/h200_ms/bin/python -m torch.distributed.run --nproc_per_node 8 /mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/cli/pt.py --deepspeed /mnt/nvme1/luoyingfeng/llm-mt/configs/ds_z2_config_bf16.json --add_version False --check_model False --model /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base --train_type full --streaming true --packing true --attn_impl flash_attn --dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl --split_dataset_ratio 0 --val_dataset /mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl --torch_dtype bfloat16 --per_device_train_batch_size 25 --per_device_eval_batch_size 25 --learning_rate 2e-5 --warmup_ratio 0.05 --gradient_accumulation_steps 3 --save_strategy steps --logging_strategy steps --eval_strategy steps --eval_steps 1000 --save_steps 1000 --logging_steps 10 --max_length 2048 --max_steps 5000 --output_dir /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B --dataloader_num_workers 8 --dataset_num_proc 1 --seed 42 --report_to tensorboard --ddp_timeout 180000000`
+WARNING:__main__:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[2025-09-16 01:28:59,266] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 01:28:59,539] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 01:28:59,663] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 01:28:59,750] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 01:28:59,763] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 01:28:59,799] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[2025-09-16 01:28:59,937] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-09-16 01:28:59,983] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  def forward(ctx, input, weight, bias=None):
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
+  def backward(ctx, grad_output):
+[INFO:swift] Successfully registered `/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/llm/dataset/data/dataset_info.json`.
+[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] Because len(args.val_dataset) > 0, setting split_dataset_ratio: 0.0
+[INFO:swift] Setting args.lazy_tokenize: False
+[INFO:swift] Using deepspeed: {'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}}
+[2025-09-16 01:29:00,983] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 01:29:00,983] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[W916 01:29:00.829333715 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 01:29:01,082] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:01.935010487 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 01:29:01,110] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:01.956466318 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[INFO:swift] Global seed set to 42
+[INFO:swift] args: TrainArguments(
+_n_gpu=-1,
+acc_strategy=token,
+accelerator_config={'dispatch_batches': False},
+adafactor=False,
+adalora_beta1=0.85,
+adalora_beta2=0.85,
+adalora_deltaT=1,
+adalora_init_r=12,
+adalora_orth_reg_weight=0.5,
+adalora_target_r=8,
+adalora_tfinal=0,
+adalora_tinit=0,
+adam_beta1=0.9,
+adam_beta2=0.95,
+adam_epsilon=1e-08,
+adapter_act=gelu,
+adapter_length=128,
+adapters=[],
+add_version=False,
+agent_template=None,
+aligner_lr=None,
+attn_impl=flash_attn,
+auto_find_batch_size=False,
+average_tokens_across_devices=False,
+batch_eval_metrics=False,
+bf16=True,
+bf16_full_eval=False,
+bnb_4bit_compute_dtype=torch.bfloat16,
+bnb_4bit_quant_storage=None,
+bnb_4bit_quant_type=nf4,
+bnb_4bit_use_double_quant=True,
+boft_block_num=0,
+boft_block_size=4,
+boft_dropout=0.0,
+boft_n_butterfly_factor=1,
+cached_dataset=[],
+channels=None,
+check_model=False,
+ckpt_dir=None,
+columns={},
+create_checkpoint_symlink=False,
+custom_dataset_info=[],
+custom_register_path=[],
+data_seed=42,
+dataloader_drop_last=False,
+dataloader_num_workers=8,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/train1.jsonl'],
+dataset_num_proc=1,
+dataset_shuffle=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=180000000,
+debug=None,
+deepspeed={'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'zero_allow_untested_optimizer': True, 'fp16': {'enabled': False, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'zero_optimization': {'stage': 2, 'allgather_partitions': True, 'allgather_bucket_size': 500000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 500000000.0, 'contiguous_gradients': True, 'round_robin_gradients': True}},
+deepspeed_autotp_size=None,
+device_map=None,
+disable_tqdm=None,
+do_eval=False,
+do_predict=False,
+do_train=False,
+download_mode=reuse_dataset_if_exists,
+ds3_gather_for_generation=True,
+eval_accumulation_steps=None,
+eval_dataset=[],
+eval_dataset_args=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_generation_config=None,
+eval_limit=None,
+eval_on_start=False,
+eval_steps=1000.0,
+eval_strategy=steps,
+eval_use_evalscope=False,
+eval_use_gather_object=False,
+external_plugins=[],
+fourier_n_frequency=2000,
+fourier_scaling=300.0,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_aligner=True,
+freeze_llm=False,
+freeze_parameters=[],
+freeze_parameters_ratio=0.0,
+freeze_parameters_regex=None,
+freeze_vit=True,
+fsdp=,
+fsdp_config=None,
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+galore_cos_threshold=0.4,
+galore_gamma_proj=2,
+galore_optim_per_parameter=False,
+galore_proj_bits=4,
+galore_proj_group_size=256,
+galore_proj_quant=False,
+galore_proj_type=std,
+galore_quantization=False,
+galore_queue_size=5,
+galore_rank=128,
+galore_scale=1.0,
+galore_target_modules=None,
+galore_update_proj_gap=50,
+galore_with_embedding=False,
+generation_config=None,
+generation_max_length=None,
+generation_num_beams=None,
+gradient_accumulation_steps=3,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=False,
+group_by_length=False,
+half_precision_backend=auto,
+hqq_axis=None,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=None,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_args_error=False,
+ignore_data_skip=False,
+include_for_metrics=[],
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_strategy=None,
+init_weights=True,
+interleave_prob=None,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+lazy_tokenize=False,
+learning_rate=2e-05,
+length_column_name=length,
+lisa_activated_layers=0,
+lisa_step_interval=20,
+llamapro_num_groups=None,
+llamapro_num_new_blocks=4,
+load_args=False,
+load_best_model_at_end=False,
+load_data_args=False,
+load_from_cache_file=True,
+local_rank=0,
+local_repo_path=None,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/runs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+logprobs=False,
+lora_alpha=32,
+lora_bias=none,
+lora_dropout=0.05,
+lora_dtype=None,
+lora_ga_batch_size=2,
+lora_ga_direction=ArB2r,
+lora_ga_iters=2,
+lora_ga_max_length=1024,
+lora_ga_scale=stable,
+lora_ga_stable_gamma=16,
+lora_modules=[],
+lora_rank=8,
+lorap_lr_ratio=None,
+loss_scale=default,
+loss_type=None,
+lr_scheduler_kwargs=None,
+lr_scheduler_type=cosine,
+max_epochs=None,
+max_grad_norm=1.0,
+max_length=2048,
+max_memory={},
+max_model_len=None,
+max_new_tokens=64,
+max_pixels=None,
+max_steps=5000,
+metric=None,
+metric_for_best_model=loss,
+model=/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base,
+model_author=None,
+model_kwargs={},
+model_name=None,
+model_revision=None,
+model_type=qwen3,
+modules_to_save=[],
+mp_parameters=,
+neftune_noise_alpha=None,
+new_special_tokens=[],
+no_cuda=False,
+norm_bbox=None,
+num_beams=1,
+num_labels=None,
+num_train_epochs=3.0,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+optimizer=None,
+output_dir=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B,
+overwrite_output_dir=False,
+packing=True,
+padding_free=False,
+padding_side=right,
+past_index=-1,
+per_device_eval_batch_size=25,
+per_device_train_batch_size=25,
+predict_with_generate=False,
+prediction_loss_only=False,
+problem_type=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+quant_bits=None,
+quant_method=None,
+ray_scope=last,
+reft_args=None,
+reft_intervention_type=LoreftIntervention,
+reft_layer_key=None,
+reft_layers=None,
+reft_rank=4,
+remove_unused_columns=True,
+repetition_penalty=None,
+report_to=['tensorboard'],
+response_prefix=None,
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+resume_only_model=False,
+rope_scaling=None,
+router_aux_loss_coef=0.0,
+run_name=/mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=1000.0,
+save_strategy=steps,
+save_total_limit=None,
+seed=42,
+sequence_parallel_size=1,
+shuffle_buffer_size=1000,
+skip_memory_metrics=True,
+sortish_sampler=False,
+split_dataset_ratio=0.0,
+stop_words=[],
+stopping_strategy=first_exhausted,
+stream=False,
+streaming=True,
+strict=False,
+swanlab_exp_name=None,
+swanlab_lark_secret=None,
+swanlab_lark_webhook_url=None,
+swanlab_mode=cloud,
+swanlab_project=None,
+swanlab_token=<SWANLAB_TOKEN>,
+swanlab_workspace=None,
+system=None,
+target_modules=['all-linear'],
+target_parameters=None,
+target_regex=None,
+task_type=causal_lm,
+temperature=0.0,
+template=qwen3,
+template_backend=swift,
+tf32=None,
+top_k=None,
+top_logprobs=None,
+top_p=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_dtype=torch.bfloat16,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tp_size=0,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+train_dataloader_shuffle=True,
+train_type=full,
+trainable_parameters=[],
+trainable_parameters_regex=None,
+truncation_strategy=delete,
+tuner_backend=peft,
+use_chat_template=True,
+use_cpu=False,
+use_dora=False,
+use_galore=False,
+use_hf=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_liger_kernel=False,
+use_logits_to_keep=None,
+use_mps_device=False,
+use_rslora=False,
+use_swift_lora=False,
+val_dataset=['/mnt/nvme1/luoyingfeng/llm-mt/data_arr/10lang_cpt_mono_0.5B/valid.jsonl'],
+val_dataset_shuffle=False,
+vera_d_initial=0.1,
+vera_dropout=0.0,
+vera_projection_prng_key=0,
+vera_rank=256,
+vit_gradient_checkpointing=None,
+vit_lr=None,
+warmup_ratio=0.05,
+warmup_steps=0,
+weight_decay=0.1,
+zero_hpz_partition_size=None,
+)
+[INFO:swift] Loading the model using model_dir: /mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base
+[INFO:swift] attn_impl: flash_attn
+[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][2025-09-16 01:29:01,970] [INFO] [comm.py:637:init_distributed] cdb=None
+[2025-09-16 01:29:01,975] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:01.822770585 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[W916 01:29:01.826079848 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 01:29:02,013] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:02.861727529 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 01:29:02,022] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:02.870953036 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+[2025-09-16 01:29:02,136] [INFO] [comm.py:637:init_distributed] cdb=None
+[W916 01:29:02.988242244 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:03,  1.61s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.13s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.20s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.05it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.07it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.01s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.03s/it]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.15s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.30s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:01<00:00,  1.06it/s]Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.57it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.10it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.34it/s]
+[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
+[INFO:swift] model_info: ModelInfo(model_type='qwen3', model_dir='/mnt/nvme3/luoyingfeng/model_card/Qwen3-4B-Base', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen3Config {
+  "_attn_implementation_autoset": true,
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+, task_type='causal_lm', num_labels=None)
+[INFO:swift] model.generation_config: GenerationConfig {
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 64,
+  "pad_token_id": 151643
+}
+
+[INFO:swift] Setting args.use_chat_template: False
+[INFO:swift] Setting args.loss_scale: 'all'
+[INFO:swift] default_system: None
+[INFO:swift] max_length: 2048
+[INFO:swift] response_prefix: ''
+[INFO:swift] agent_template: react_en
+[INFO:swift] Start time of running main: 2025-09-16 01:29:04.435786
+[INFO:swift] swift.__version__: 3.7.3
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:01<00:00,  1.02it/s]Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.52it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.16s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.27it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.10s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
+Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.20s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.24it/s]
+[INFO:swift] train_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+[INFO:swift] val_dataset: IterableDataset({
+    features: ['messages'],
+    num_shards: 1
+})
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] [INPUT_IDS] [18493, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, 44, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [INPUT] 在平凡的学习、工作、生活中，大家都写过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开心极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事发生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少的食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一��又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过来，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸爸、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>Maju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] [LABELS_IDS] [-100, 107167, 105595, 5373, 99257, 5373, 102438, 3837, 104695, 61443, 38182, 104745, 100003, 3837, 104745, 104625, 87335, 101286, 3837, 101897, 103947, 100383, 101118, 3837, 67338, 102064, 99877, 36407, 102124, 46944, 100220, 100240, 9370, 111048, 1773, 100624, 100141, 104745, 107343, 107548, 101036, 11319, 114566, 100452, 105191, 104387, 104197, 105285, 14777, 104745, 22, 99824, 3837, 100437, 101113, 3837, 109477, 100006, 99729, 8997, 26288, 105285, 14777, 104745, 10236, 107, 229, 16, 198, 2073, 53222, 111241, 104444, 3837, 99934, 99528, 49082, 3837, 99621, 16628, 99741, 3837, 50009, 108052, 854, 20412, 101988, 102376, 71268, 100645, 100854, 99195, 108008, 3837, 88308, 117159, 1773, 100632, 3837, 107954, 100090, 7948, 104890, 107935, 8997, 26288, 105285, 108739, 99391, 3837, 35946, 112181, 29490, 102300, 99830, 3837, 99621, 52801, 16628, 99741, 3837, 102231, 29490, 109979, 104781, 3837, 35946, 17447, 95256, 105611, 102440, 101187, 108040, 103167, 33108, 111877, 48921, 17881, 104006, 114339, 3837, 16872, 95256, 105611, 99378, 95256, 9370, 75437, 99413, 102693, 3837, 102586, 99619, 17447, 101347, 100434, 74040, 38035, 9370, 101433, 108040, 103167, 3837, 104402, 49828, 99795, 102020, 3837, 101228, 57566, 99226, 34187, 6313, 107967, 101077, 104334, 3837, 102208, 113305, 30709, 100343, 3837, 104038, 110272, 3837, 69041, 107723, 105883, 6313, 198, 104276, 9370, 35946, 105694, 99226, 34187, 6313, 108907, 117388, 117190, 102376, 9973, 6313, 198, 113827, 3837, 97639, 36587, 36587, 116423, 3837, 101997, 108759, 9370, 104996, 3837, 74763, 62922, 99639, 86117, 99208, 99594, 1773, 102175, 77540, 100484, 104197, 99613, 44636, 26288, 101174, 101355, 3837, 30709, 99808, 99679, 99318, 99318, 9370, 3837, 105611, 101347, 100167, 117638, 99243, 9370, 102430, 119364, 3837, 88051, 99639, 99708, 105664, 99949, 8903, 117464, 9973, 6313, 198, 104020, 26939, 107723, 34187, 3837, 35946, 102313, 29490, 112198, 104853, 39953, 3837, 115232, 108538, 28072, 67279, 3837, 111128, 110961, 105871, 110926, 100090, 7948, 34187, 6313, 198, 101140, 3837, 97639, 99495, 103088, 105943, 45629, 3837, 35946, 104398, 100688, 3837, 100549, 103088, 105943, 36987, 99477, 29524, 111903, 100523, 56006, 110194, 5691, 6313, 99945, 103755, 103755, 119392, 62, 63109, 29490, 103206, 1773, 99745, 99593, 26288, 99360, 100443, 115833, 3837, 110267, 100443, 53222, 105480, 67279, 69249, 3837, 102313, 99226, 34187, 1773, 108954, 5122, 99601, 100659, 104103, 100021, 105953, 6313, 16530, 105153, 99405, 3837, 16530, 105153, 99621, 9370, 3837, 88051, 52801, 8997, 104326, 3837, 97639, 102149, 44991, 101371, 99680, 45629, 3837, 102208, 104494, 26288, 104611, 36987, 100090, 104300, 99495, 75758, 35946, 104398, 102608, 100549, 44991, 101371, 99680, 36987, 99421, 37984, 116322, 75758, 220, 44991, 101371, 99680, 45629, 99350, 101467, 99232, 3837, 113093, 111121, 35946, 8997, 104221, 3837, 97639, 97706, 102149, 101371, 105943, 3837, 40820, 101849, 3837, 40820, 101371, 99680, 14053, 45629, 8997, 100644, 3837, 35946, 104619, 104686, 3837, 104979, 102154, 99350, 6313, 107924, 32847, 6313, 198, 112128, 99595, 100167, 100066, 3837, 102197, 99465, 99641, 99528, 100040, 1773, 33108, 99225, 51827, 99462, 100251, 3837, 108441, 103303, 16628, 101253, 1773, 104596, 108658, 9370, 116467, 69249, 3837, 105786, 117019, 33108, 30709, 103088, 101949, 100018, 107954, 3837, 100678, 36587, 26288, 105285, 104173, 99851, 99567, 9370, 101036, 11319, 14880, 49187, 35946, 101283, 44793, 36407, 8997, 99321, 38953, 26288, 105285, 14777, 101219, 111241, 104444, 70074, 15946, 106517, 3837, 101959, 106235, 101039, 112181, 111505, 99938, 26939, 118184, 3837, 53222, 100451, 30858, 100183, 1773, 35946, 44991, 16872, 75108, 20755, 40820, 99360, 101124, 100451, 30858, 100183, 104089, 33108, 117019, 111435, 105514, 3837, 97639, 105939, 75437, 79599, 32648, 33108, 100451, 30858, 100183, 3837, 99355, 99907, 99907, 9370, 104071, 118184, 3837, 101077, 53222, 8997, 107053, 105786, 35946, 103088, 99818, 99360, 100451, 30858, 100183, 28072, 99793, 3837, 99258, 117019, 111678, 108304, 9370, 13, 109703, 115257, 1773, 97639, 105779, 104525, 99164, 3837, 104995, 38182, 99593, 83031, 3837, 105786, 35946, 103088, 99818, 80158, 109482, 100451, 30858, 100183, 3837, 107053, 100451, 30858, 100183, 99283, 108270, 104291, 3837, 107172, 115257, 99225, 100307, 119607, 99610, 3837, 103952, 102837, 74763, 101067, 100451, 30858, 100183, 9370, 104291, 5691, 11319, 109148, 100254, 24071, 222, 31843, 69, 31207, 104791, 100146, 28311, 97639, 99466, 101421, 71268, 100588, 109412, 3837, 108250, 104267, 99705, 98650, 102114, 34187, 3837, 35946, 116347, 100451, 30858, 100183, 36993, 100228, 34187, 111053, 3837, 63109, 99624, 99624, 9370, 100421, 3837, 44934, 101998, 100195, 100868, 1773, 106210, 3837, 97639, 99466, 91680, 99880, 100451, 30858, 100183, 26232, 104291, 3837, 62922, 112151, 111053, 100484, 1773, 35946, 99212, 101492, 119691, 119680, 107352, 101421, 99236, 100421, 99236, 99234, 3837, 35946, 116578, 102265, 105003, 99631, 17340, 118082, 9370, 99695, 27773, 8997, 18493, 108071, 103947, 2073, 103082, 99317, 854, 15946, 3837, 35946, 103961, 105545, 111343, 106125, 3837, 101959, 104115, 36587, 36987, 111343, 106125, 3837, 110622, 100451, 30858, 100183, 9370, 100843, 99691, 64720, 38212, 3837, 102572, 81264, 107331, 97639, 101077, 106202, 13343, 3837, 104233, 117245, 99518, 114434, 113793, 104140, 105007, 2293, 2073, 100451, 30858, 100183, 99518, 101957, 101957, 104291, 34187, 75758, 2073, 110709, 107172, 99259, 100819, 116221, 3837, 854, 35946, 100180, 114551, 41505, 65101, 99259, 99180, 77959, 101891, 3837, 102636, 99561, 35946, 34187, 5691, 6313, 100458, 122113, 100027, 119128, 99869, 102984, 121285, 35496, 231, 5691, 3837, 29524, 68862, 29258, 99693, 1773, 43288, 101228, 100226, 99851, 104099, 6313, 198, 99487, 100451, 30858, 100183, 104233, 102346, 18947, 99851, 99567, 104197, 105285, 14777, 3837, 106884, 104233, 110586, 3837, 104284, 30534, 104211, 99466, 107954, 13343, 53222, 111241, 104444, 5373, 53222, 100451, 30858, 100183, 33108, 102657, 99849, 102045, 13343, 3837, 60533, 99464, 3837, 62922, 65101, 97639, 99654, 99851, 99567, 8997, 102376, 108659, 101988, 31235, 109815, 103446, 9370, 105832, 1773, 97611, 105686, 101199, 104417, 3837, 99212, 104115, 69041, 99466, 109432, 104417, 9370, 102376, 112322, 100003, 8997, 26288, 7948, 105134, 3837, 104902, 33108, 26288, 100659, 104278, 112181, 9370, 109195, 3837, 99634, 119477, 104334, 3837, 97639, 106138, 99405, 99391, 99938, 34187, 3837, 105859, 100464, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 99305, 118673, 100213, 1773, 106864, 100000, 100186, 82647, 9370, 3837, 99652, 106411, 99610, 42140, 44729, 42140, 99477, 8997, 99405, 106570, 99391, 99938, 3837, 97639, 106138, 99934, 99528, 49082, 34187, 1773, 101140, 99360, 99528, 49082, 98279, 88991, 103958, 3837, 87256, 99360, 100802, 63836, 99934, 17447, 105279, 100773, 111786, 1773, 99934, 99477, 18600, 13343, 3837, 30534, 99805, 99610, 99934, 3837, 51463, 99477, 99495, 8997, 99495, 102172, 77540, 110869, 75061, 3837, 97639, 104134, 71134, 17447, 16628, 102214, 1773, 18493, 102458, 111678, 104444, 102045, 1773, 111678, 33447, 80158, 30440, 99405, 107946, 99938, 34187, 1773, 100655, 20412, 100186, 82647, 106232, 3837, 99652, 106411, 99610, 7948, 7948, 18830, 88683, 1773, 97706, 99992, 44793, 99929, 102100, 109861, 39426, 9370, 99800, 3837, 105139, 105863, 101425, 3837, 99652, 106411, 99610, 99832, 85361, 111226, 8997, 99495, 104030, 13343, 3837, 45629, 45629, 17523, 17523, 71268, 53222, 102300, 103850, 1773, 101916, 106519, 105231, 103850, 105484, 3837, 99212, 103850, 116943, 103585, 3837, 101280, 99226, 34187, 3837, 103973, 29732, 16530, 116435, 29077, 3837, 38182, 106570, 102376, 3837, 118581, 99518, 107651, 3837, 105666, 33108, 104902, 111656, 107847, 104432, 104066, 33108, 100134, 15946, 3837, 100549, 99466, 99257, 102088, 3837, 100134, 101300, 8997, 88308, 104197, 105285, 14777, 104037, 100654, 3837, 99519, 91777, 104128, 71618, 112096, 101280, 104197, 100167, 8997, 104301, 35946, 26288, 105285, 14777, 99774, 35727, 3837, 104624, 35946, 104053, 99774, 35727, 8997, 26288, 105285, 14777, 9370, 104030, 3837, 106047, 104071, 106954, 99366, 3837, 105786, 42411, 109216, 36987, 104335, 110138, 114154, 100003, 5691, 6313, 120144, 119557, 122204, 121285, 119773, 121059, 119142, 170, 222, 254, 5691, 8997, 97639, 92133, 104059, 99857, 99525, 110792, 24968, 11622, 102634, 101953, 99360, 63703, 27091, 71268, 101413, 34794, 3837, 114312, 100802, 63836, 1773, 99518, 11622, 72990, 100241, 99190, 2073, 121999, 44729, 854, 99360, 100050, 104957, 18493, 104135, 3837, 100420, 87256, 101358, 106804, 109703, 115257, 3837, 99654, 3837, 103952, 114154, 80158, 19108, 34187, 8997, 106356, 99314, 101180, 3837, 101952, 97639, 99190, 99692, 114154, 36587, 36987, 26288, 38182, 104300, 3837, 99243, 102284, 16530, 108738, 3837, 105603, 34187, 87256, 29258, 99190, 100003, 75758, 35946, 108954, 36987, 104389, 99232, 103554, 111031, 100854, 114154, 106041, 105603, 100373, 81264, 105420, 3837, 35946, 101144, 18947, 110721, 5122, 100561, 36407, 100369, 26288, 106099, 100822, 3837, 99360, 118579, 17447, 104165, 105151, 107495, 100194, 3837, 99934, 18493, 104135, 8997, 113573, 100644, 104458, 106356, 105511, 3837, 35946, 111256, 99822, 108032, 85336, 105275, 18947, 107000, 104150, 89012, 106356, 99405, 3837, 112696, 109785, 108927, 74763, 101161, 3837, 110567, 106356, 70927, 36407, 8997, 103083, 106356, 70927, 36407, 103920, 3837, 110267, 107000, 113121, 101255, 17447, 109703, 115257, 1773, 106356, 101161, 3837, 100549, 100523, 74763, 107651, 1773, 14777, 112594, 112594, 106578, 9370, 79599, 101233, 101286, 114177, 109003, 9370, 110914, 3837, 100261, 99331, 99164, 106356, 101954, 107288, 101625, 3837, 21, 15, 99408, 110350, 9370, 109703, 115257, 74763, 100421, 27733, 99164, 103952, 21, 15, 18947, 105514, 8997, 106356, 102203, 46306, 109703, 115257, 3837, 97639, 55286, 93149, 106800, 9370, 107000, 1773, 35946, 99677, 32648, 14777, 27733, 3837, 99360, 107000, 101913, 118509, 103077, 101537, 18493, 106356, 9370, 104488, 8997, 108668, 121769, 6313, 106356, 99518, 100514, 91777, 97706, 99914, 34187, 6313, 198, 102376, 100090, 7948, 102788, 105523, 107477, 65676, 5691, 11319, 99381, 241, 99971, 100042, 5691, 8997, 105285, 40820, 118666, 3837, 101935, 80158, 101611, 35946, 109195, 3837, 36587, 100644, 30534, 26939, 106356, 5373, 105943, 45629, 100090, 7948, 1773, 35946, 111912, 3837, 102483, 99226, 34187, 3837, 110665, 109195, 1773, 111505, 99391, 99938, 3837, 109585, 16628, 102214, 3837, 80158, 33108, 102208, 5373, 101935, 100018, 99901, 39953, 104374, 106356, 45629, 8997, 106356, 45629, 18493, 99474, 16872, 3837, 100343, 101467, 99828, 110070, 80158, 99495, 1773, 35946, 104789, 104990, 106356, 45629, 3837, 106356, 5373, 105943, 106779, 18493, 102458, 111801, 34187, 1773, 35946, 14777, 101038, 106356, 5373, 105943, 3837, 80158, 102483, 29490, 99882, 109412, 36987, 106356, 5373, 105943, 3837, 97639, 36407, 116416, 100090, 7948, 34187, 75758, 106356, 5373, 105943, 99350, 106397, 29490, 48738, 18947, 101677, 8997, 105480, 106356, 5373, 105943, 45629, 3837, 99650, 80158, 104169, 99674, 104686, 106678, 100413, 3837, 18830, 104618, 5373, 18830, 100443, 5373, 18830, 109378, 104008, 1773, 35946, 104203, 99405, 3837, 106356, 104203, 107557, 36987, 100134, 108702, 3837, 104710, 101300, 55807, 39165, 102654, 35946, 118566, 56006, 103982, 111244, 101300, 13343, 3837, 106356, 102483, 29490, 103206, 3837, 113093, 101651, 35946, 99929, 105414, 99518, 117045, 90395, 115833, 46944, 108052, 8997, 35946, 102483, 29490, 29077, 102346, 108052, 3837, 54926, 36587, 102570, 1773, 77288, 105519, 3837, 35946, 106961, 101181, 101208, 100626, 101235, 104701, 3837, 99999, 3837, 35946, 116091, 28291, 103421, 5122, 18493, 16628, 100695, 69249, 3837, 103962, 101896, 116240, 29490, 100134, 3837, 100627, 100716, 3837, 108287, 106961, 101181, 104701, 8997, 111505, 15946, 99938, 3837, 106235, 108526, 34187, 106356, 5373, 105943, 3837, 99901, 39953, 104122, 34187, 8997, 100090, 7948, 115672, 20412, 14224, 65676, 5691, 11319, 99381, 241, 99971, 5691, 8997, 105285, 40820, 3837, 35946, 112181, 109195, 3837, 109585, 16628, 102214, 5373, 16628, 113233, 33108, 16628, 110383, 3837, 101077, 99557, 110961, 100626, 117999, 14053, 85336, 101849, 106356, 45629, 85336, 100090, 7948, 3837, 35946, 30440, 102313, 34187, 8997, 101849, 106356, 45629, 18493, 42144, 109, 17447, 3837, 99486, 85336, 99582, 100230, 100696, 3837, 99165, 59258, 1773, 102865, 45629, 109967, 106825, 99593, 100408, 107500, 3837, 105285, 40820, 85336, 100090, 101506, 108560, 42140, 3837, 107500, 17447, 54926, 46944, 34794, 99901, 24156, 104338, 3837, 101228, 102248, 115507, 9973, 6313, 35946, 54926, 70790, 100371, 104338, 3837, 108432, 18830, 110961, 102865, 102144, 1773, 16530, 106868, 80158, 99495, 42144, 109, 17447, 3837, 104853, 39953, 3837, 99540, 17254, 99246, 103287, 99639, 59956, 59956, 102199, 97706, 99992, 45629, 31235, 113911, 102297, 59074, 3837, 101935, 111267, 105275, 97084, 105950, 1773, 117222, 3837, 80158, 104356, 101849, 106356, 103348, 99333, 27091, 29490, 33108, 97639, 116657, 3837, 35946, 99694, 39426, 103427, 36987, 101849, 99989, 75107, 91050, 7948, 52801, 75758, 101849, 106356, 36587, 36987, 107924, 52801, 6313, 107924, 52801, 75758, 107484, 3837, 80158, 99213, 99164, 97639, 104071, 42411, 45629, 3837, 101849, 106356, 45629, 106789, 63703, 99432, 3837, 30440, 106488, 99314, 99180, 103425, 103138, 103138, 3837, 116493, 99847, 17340, 34187, 8997, 14777, 104400, 101849, 106356, 45629, 3837, 99650, 113641, 102871, 106678, 3837, 18830, 100857, 44729, 5373, 108202, 102030, 5373, 100180, 44729, 5373, 101580, 99251, 14053, 3837, 100132, 109366, 99405, 3837, 35946, 104203, 99405, 99164, 100413, 3837, 104203, 101952, 100234, 1773, 99650, 97706, 107557, 3837, 100134, 108702, 3837, 104710, 101300, 8997, 97639, 99437, 99610, 99437, 99610, 80158, 26939, 99405, 117371, 20450, 34187, 3837, 35946, 26288, 39426, 26288, 39426, 29490, 99405, 99164, 3837, 101849, 99680, 11622, 46944, 99491, 99491, 30709, 100050, 100749, 3837, 104169, 99495, 108223, 100749, 100167, 102461, 3837, 35946, 100399, 117459, 3837, 101228, 102308, 99226, 34187, 8997, 109327, 99938, 106735, 3837, 101849, 105943, 115833, 46944, 108052, 3837, 100549, 35946, 102231, 100134, 1773, 35946, 102483, 29490, 111682, 108052, 3837, 105317, 70074, 2073, 102570, 33590, 108526, 34187, 101849, 105943, 3837, 20742, 117999, 80158, 113305, 39953, 36605, 97639, 104122, 34187, 8997, 108668, 3837, 100090, 7948, 100681, 30440, 88051, 52801, 9973, 6313, 151645, -100, 51413, 13422, 7777, 11670, 36600, 16502, 645, 37343, 644, 35246, 13, 40436, 2636, 2922, 3346, 1424, 4844, 323, 21511, 10450, 645, 13, 472, 73535, 13, 151645]
+[INFO:swift] [LABELS] [-100 * 1]平凡的学习、工作、生活中，大家都写过作文吧，作文是由文字组成，经过人的思想考虑，通过语言组织来表达一个主题意义的文体。那么一般作文是怎么写的呢？以下是小编精心整理的大年初一作文7篇，欢迎阅读，希望大家能够喜欢。
+大年初一作文 篇1
+“放鞭炮，贴春联，穿新衣，收红包”是每年春节都必须做的几件事，今年也不例外。不过，过年拜年是最有趣的。
+大年初一一早，我早早地起了床，穿好新衣，好好地打扮了一下，我上身穿着白色羊绒衫和黑白相间的小裙子，下身穿着紧身的打底裤，外面套上一件渐变色的羽绒衫，搭配得自然协调，真是美极了！一切都准备好了，爸爸开着小汽车，带着一家人，向老家前进！
+此时的我激动极了！这是我盼望已久的春节��！
+一路上，我们说说笑笑，看看路边的风景，也别是一番风趣。公路两旁的大树高大挺拔，小草绿油油的，穿着一件雪白雪白的棉袄，真是一幅美丽的冬日画卷��！
+终于到老家了，我开心地蹦下了车，拎着手提包，拉着爸爸妈妈的手一起去拜年了！
+首先，我们到了姨奶奶家，我走了过去，祝姨奶奶：“福如东海寿比南山�！币棠棠涕_心地笑了。抓了一大把糖给了我，我把糖放进了包里，开��极了。心想：现在人们的生活水平提高了！不愁吃，不愁穿的，真好。
+接下来，我们去了三姑妈家，爸爸一声大喊：“拜年的到了！”我走了上去祝三姑妈：“财源滚滚！” 三姑妈家乐开了花，连连称赞我。
+随后，我们还去了姑奶奶，二舅，二姑妈……家。
+今天，我收获了很多，同时也很快乐！新年Happy！
+梅花伴雪舞，祥龙迎春归。和光布德泽，万物沐新辉。在这个短暂的寒假里，我和老妈和小姨一家一起过年，为什么说大年初一是惊险的呢？请听我慢慢道来。
+往常大年初一是在鞭炮声中度过，于是我们就计划早早吃过饭到院子里，放孔明灯。我三下五除二把三个孔明灯打开和老妈写下祝福，我们拿着打火机和孔明灯，兴冲冲的来到院子里，准备放。
+只见我和我姨夫把孔明灯提起来，让老妈点燃底部的.蜡烛。我们耐心等待着，大约过了一分钟，我和我姨夫就放开孔明灯，只见孔明灯自己缓缓上升，里面的烛光摇曳著，我们的目光也随着孔明灯的上升�？勺屏覀内f万没想到的是：
+我们大家的心都悬起来了，刚开始的新鲜感也没有了，我生怕孔明灯会烧了电线，心突突的跳，手心里出了汗。此刻，我们大家只希望孔明灯能上升，别停留在电线旁。我那颗忐忑不安的心越跳越快，我都不敢想象惨绝人寰的恶果。
+在全家人的“痴望”中，我突然想起要不要报警，于是我就说：“要不要报警，万一孔明灯的金属丝导电，怎么办？”正当我们准备打电话时，让我们意想不到又欣喜万分的事发生了—“孔明灯又徐徐上升了！”“原来是里面的热空气太少，”我松了一口气，“像热气球一样，吓死我了�！贝蠹叶妓闪艘豢跉�，如释重担。这真是虚惊一场！
+这个孔明灯让我们过了个惊险的大年初一，但也让我们难忘，我也要提醒大家过年时放鞭炮、放孔明灯和别的爆竹时，注意安全，别像我们这样惊险。
+春节是我国每年最盛大隆重的节日。我的家乡处于南方，那我就向大家介绍一下南方的春节习俗吧。
+大年三十，小孩和大人们都要早早的起床，洗漱好了，我们就开始吃早饭了，茶叶蛋是不可少的食物，它象征著团团圆圆。粥也是不可少的，它象征著多子多福。
+吃完了早饭，我们就开始贴春联了。首先把春联移到正确的地方，再把四个角贴上透明胶就行了。贴福字时，要倒著贴，表示福到了。
+到了下午两点多钟，我们就要换上新衣服。在门口点燃炮竹。点燃后就可吃年夜饭了。鱼是不可少的食物，它象征著年年有余。还有一道既营养还可口的菜，那就是玉米粒，它象征著荣华富贵。
+到了晚上时，家家户户都放起了烟花。天空顿时变成了烟花的世界，那烟花绚丽多彩，美丽极了，让人目不暇接，过完了春节，新的一年又开始了，大人和小孩们都进入了紧张的工作和学习中，祝大家工作顺利，学习进步。
+今年的大年初一有点特别，因为老天下起了一场美丽的大雪。
+这就是我大年初一的一天，这也是我快乐的一天。
+大年初一的晚上，弟弟来到我家玩，我和他商量：“咱们来做灯笼吧�！钡艿芤豢诒愦馍�。
+我们找了一个废酒盒子；用剪刀把四面都挖空，留住四个角。又用土办法做“糨子”把纸粘在上面，里面再固定一根蜡烛，这样，我们的灯笼就成功了。
+爷爷走过来，看着我们做好的灯笼说：“大过年的，白颜色不吉利，扔了再重做吧！”我心想：“人家花半天工夫做的灯笼就这样扔掉？”忽然，我有了个主意：搬来两个大饮料瓶，把瓶子上红色标签撕下来，贴在上面。
+恰好今天又是爷爷生日，我用自己的零花钱去买了个蛋糕回来给爷爷吃，回到家才发现亲人也来了，只剩下爷爷没来。
+趁爷爷没来的时候，我把蛋糕拿出来插上蜡烛。爷爷来了，祝寿也开始了。一簇簇燃烧的火苗组成一朵吉祥的莲花，映照着爷爷幸福的脸庞，60根彩色的蜡烛也跳动着我们的60个祝福。
+爷爷吹完蜡烛，我们开始分享美味的蛋糕。我灵机一动，把蛋糕上的奶油一下子抹在爷爷的脸上。
+哇噻！爷爷又返老还童了！
+春节拜年对我来讲是一件非�？鞓肥虑�。
+年初二一大早，妈妈就催我起床，说今天要到爷爷、奶奶家拜年。我一听，高兴极了，连忙起床。吃过早饭，穿上新衣服，就和爸爸、妈妈一起坐车前往爷爷家。
+爷爷家在乡下，汽车开了不到半小时就到了。我还没走到爷爷家，爷爷、奶奶就已经在门口等候了。我一看到爷爷、奶奶，就高兴地叫起来了：“爷爷、奶奶，我们来给你们拜年了！”爷爷、奶奶乐呵呵地笑个不停。
+进了爷爷、奶奶家，他们就给我拿了很多好吃东西，有水果、有糖、有花生等等。我一边吃，爷爷一边问我：“学习好不好，有没有进步”。当听说我学习成绩比以前有很大进步时，爷爷高兴地笑了，连连夸我既聪明又懂事，并给了我一个红包。
+我高兴地接过了红包，连说谢谢。但我知道，我与其他同学相比还有很大差距，所以，我暗暗发誓：在新一年里，一定要更加刻苦地学习，提高成绩，缩小与其他同学差距。
+吃过中饭，我们就告别了爷爷、奶奶，坐车回家了。
+拜年对我来说是件非�？鞓肥�。
+年初二，我早早起床，穿上新衣服、新裤子和新鞋子，准备跟爸爸妈妈还有舅舅……去舅爷爷家去拜年，我可开心了。
+舅爷爷家在墱上，就是去贵池方向，很近。在我家门前乘坐了一辆公交车，年初二去拜年人还真多，公交车上连一个空坐位都没有，真是人群拥挤��！我连站地方都没有，还好有爸爸妈妈在我身边。不一会儿就到了墱上，下了车，印入眼帘是一排排房子还有一家最耀眼购物城，妈妈在里面买了些礼物。不远处，就看见舅爷爷笑容满面地和我们打招呼，我脱口而出：“舅爷爷新年好！”舅爷爷说：“新年好！新年好！”说完，就领着我们来到他家，舅爷爷家住在四楼，可把我走气喘吁吁，实在是太累人了。
+一走进舅爷爷家，他们拿来好多好吃，有瓜子、杏仁、松子、葡萄干……，都是我喜欢吃，我一边吃着东西，一边看着电视。他们还问我，学习好不好，有没有进步。
+我们谈著谈著就到吃午饭时间了，我大口大口地吃着，舅妈用一个非常非常小纸杯，给我到了一小杯雪碧，我喝了一口，真是爽极了。
+吃完饭过后，舅奶奶给了我一个红包，祝我好好学习。我高兴地接过红包，说了声“谢谢”，告别了舅奶奶，表舅舅就开着车送我们回家了。
+哇，拜年感觉可真好��！<|im_end|>[-100 * 1]aju kan nggon ku kaie uwong eh. Kabeh enggo handphone and smartphone waie. Hahaha.<|im_end|>
+[INFO:swift] The TrainArguments will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/args.json
+[INFO:swift] model: Qwen3ForCausalLM(
+  (model): Qwen3Model(
+    (embed_tokens): Embedding(151936, 2560)
+    (layers): ModuleList(
+      (0-35): 36 x Qwen3DecoderLayer(
+        (self_attn): Qwen3Attention(
+          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
+          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
+          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
+          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
+          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
+        )
+        (mlp): Qwen3MLP(
+          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
+          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
+          (act_fn): SiLU()
+        )
+        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
+      )
+    )
+    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
+    (rotary_emb): Qwen3RotaryEmbedding()
+  )
+  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
+)
+[INFO:swift] model_parameter_info: Qwen3ForCausalLM: 4022.4681M Params (4022.4681M Trainable [100.0000%]), 0.0001M Buffers.
+[WARNING:swift] Using IterableDataset, setting args.dataloader_num_workers to 1.
+/mnt/nvme1/luoyingfeng/ms-swift-3.7.3/swift/trainers/mixin.py:94: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
+  super().__init__(
+[INFO:swift] use_reentrant: True
+[INFO:swift] The logging file will be saved in: /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/logging.jsonl
+Train:   0%|          | 0/5000 [00:00<?, ?it/s]/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+[INFO:swift] use_logits_to_keep: True
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:   0%|          | 1/5000 [00:27<37:30:30, 27.01s/it]                                                          {'loss': 2.10210896, 'token_acc': 0.57490628, 'grad_norm': 0.66467696, 'learning_rate': 8e-08, 'memory(GiB)': 122.92, 'train_speed(iter/s)': 0.023839, 'epoch': 0.0, 'global_step/max_steps': '1/5000', 'percentage': '0.02%', 'elapsed_time': '27s', 'remaining_time': '1d 13h 30m 46s'}
+Train:   0%|          | 1/5000 [00:27<37:30:30, 27.01s/it]Train:   0%|          | 1/5000 [00:27<37:30:30, 27.01s/it]Train:   0%|          | 2/5000 [00:41<27:08:11, 19.55s/it]Train:   0%|          | 3/5000 [00:55<23:48:05, 17.15s/it]Train:   0%|          | 4/5000 [01:09<22:14:53, 16.03s/it]Train:   0%|          | 5/5000 [01:24<21:22:50, 15.41s/it]Train:   0%|          | 6/5000 [01:38<20:53:05, 15.06s/it]Train:   0%|          | 7/5000 [01:52<20:33:15, 14.82s/it]Train:   0%|          | 8/5000 [02:07<20:21:11, 14.68s/it]Train:   0%|          | 9/5000 [02:21<20:10:54, 14.56s/it]Train:   0%|          | 10/5000 [02:35<20:05:15, 14.49s/it]                                                           {'loss': 2.11970605, 'token_acc': 0.57254234, 'grad_norm': 0.65054244, 'learning_rate': 8e-07, 'memory(GiB)': 126.38, 'train_speed(iter/s)': 0.058513, 'epoch': 0.0, 'global_step/max_steps': '10/5000', 'percentage': '0.20%', 'elapsed_time': '2m 35s', 'remaining_time': '21h 37m 8s'}
+Train:   0%|          | 10/5000 [02:35<20:05:15, 14.49s/it]Train:   0%|          | 10/5000 [02:35<20:05:15, 14.49s/it]Train:   0%|          | 11/5000 [02:50<20:00:28, 14.44s/it]Train:   0%|          | 12/5000 [03:04<19:57:07, 14.40s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (133055 > 131072). Running this sequence through the model will result in indexing errors
+Train:   0%|          | 13/5000 [03:18<19:54:09, 14.37s/it]Train:   0%|          | 14/5000 [03:33<19:52:35, 14.35s/it]Train:   0%|          | 15/5000 [03:47<19:51:03, 14.34s/it]Train:   0%|          | 16/5000 [04:01<19:49:37, 14.32s/it]Train:   0%|          | 17/5000 [04:16<19:48:10, 14.31s/it]Train:   0%|          | 18/5000 [04:30<19:47:57, 14.31s/it]Train:   0%|          | 19/5000 [04:44<19:47:27, 14.30s/it]Train:   0%|          | 20/5000 [04:58<19:45:54, 14.29s/it]                                                           {'loss': 2.11793213, 'token_acc': 0.5740174, 'grad_norm': 0.57838172, 'learning_rate': 1.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.063724, 'epoch': 0.0, 'global_step/max_steps': '20/5000', 'percentage': '0.40%', 'elapsed_time': '4m 58s', 'remaining_time': '20h 40m 31s'}
+Train:   0%|          | 20/5000 [04:58<19:45:54, 14.29s/it]Train:   0%|          | 20/5000 [04:58<19:45:54, 14.29s/it]Train:   0%|          | 21/5000 [05:13<19:45:34, 14.29s/it]Train:   0%|          | 22/5000 [05:27<19:44:15, 14.27s/it]Train:   0%|          | 23/5000 [05:41<19:43:53, 14.27s/it]Train:   0%|          | 24/5000 [05:55<19:43:19, 14.27s/it]Train:   0%|          | 25/5000 [06:10<19:43:01, 14.27s/it]Train:   1%|          | 26/5000 [06:24<19:42:58, 14.27s/it]Train:   1%|          | 27/5000 [06:38<19:42:17, 14.26s/it]Train:   1%|          | 28/5000 [06:53<19:41:40, 14.26s/it]Train:   1%|          | 29/5000 [07:07<19:41:24, 14.26s/it]Train:   1%|          | 30/5000 [07:21<19:41:33, 14.26s/it]                                                           {'loss': 2.10272675, 'token_acc': 0.56717201, 'grad_norm': 0.43268162, 'learning_rate': 2.4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.06572, 'epoch': 0.01, 'global_step/max_steps': '30/5000', 'percentage': '0.60%', 'elapsed_time': '7m 21s', 'remaining_time': '20h 19m 10s'}
+Train:   1%|          | 30/5000 [07:21<19:41:33, 14.26s/it]Train:   1%|          | 30/5000 [07:21<19:41:33, 14.26s/it]Train:   1%|          | 31/5000 [07:35<19:41:54, 14.27s/it]Train:   1%|          | 32/5000 [07:50<19:42:40, 14.28s/it]Train:   1%|          | 33/5000 [08:04<19:41:33, 14.27s/it]Train:   1%|          | 34/5000 [08:18<19:40:50, 14.27s/it]Train:   1%|          | 35/5000 [08:32<19:39:51, 14.26s/it]Train:   1%|          | 36/5000 [08:47<19:40:28, 14.27s/it]Train:   1%|          | 37/5000 [09:01<19:40:22, 14.27s/it]Train:   1%|          | 38/5000 [09:15<19:39:38, 14.26s/it]Train:   1%|          | 39/5000 [09:29<19:39:22, 14.26s/it]Train:   1%|          | 40/5000 [09:44<19:40:24, 14.28s/it]                                                           {'loss': 2.10072212, 'token_acc': 0.57247045, 'grad_norm': 0.40563241, 'learning_rate': 3.2e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.066754, 'epoch': 0.01, 'global_step/max_steps': '40/5000', 'percentage': '0.80%', 'elapsed_time': '9m 44s', 'remaining_time': '20h 7m 31s'}
+Train:   1%|          | 40/5000 [09:44<19:40:24, 14.28s/it]Train:   1%|          | 40/5000 [09:44<19:40:24, 14.28s/it]Train:   1%|          | 41/5000 [09:58<19:40:17, 14.28s/it]Train:   1%|          | 42/5000 [10:12<19:39:59, 14.28s/it]Train:   1%|          | 43/5000 [10:27<19:39:31, 14.28s/it]Train:   1%|          | 44/5000 [10:41<19:38:39, 14.27s/it]Train:   1%|          | 45/5000 [10:55<19:37:44, 14.26s/it]Train:   1%|          | 46/5000 [11:09<19:36:51, 14.25s/it]Train:   1%|          | 47/5000 [11:24<19:36:09, 14.25s/it]Train:   1%|          | 48/5000 [11:38<19:35:34, 14.24s/it]Train:   1%|          | 49/5000 [11:52<19:35:19, 14.24s/it]Train:   1%|          | 50/5000 [12:06<19:35:19, 14.25s/it]                                                           {'loss': 2.06888275, 'token_acc': 0.56575297, 'grad_norm': 0.36015499, 'learning_rate': 4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.067409, 'epoch': 0.01, 'global_step/max_steps': '50/5000', 'percentage': '1.00%', 'elapsed_time': '12m 6s', 'remaining_time': '19h 59m 14s'}
+Train:   1%|          | 50/5000 [12:06<19:35:19, 14.25s/it]Train:   1%|          | 50/5000 [12:06<19:35:19, 14.25s/it]Train:   1%|          | 51/5000 [12:21<19:35:34, 14.25s/it]Train:   1%|          | 52/5000 [12:35<19:35:03, 14.25s/it]Train:   1%|          | 53/5000 [12:49<19:34:26, 14.24s/it]Train:   1%|          | 54/5000 [13:03<19:34:39, 14.25s/it]Train:   1%|          | 55/5000 [13:18<19:33:28, 14.24s/it]Train:   1%|          | 56/5000 [13:32<19:34:46, 14.26s/it]Train:   1%|          | 57/5000 [13:46<19:34:28, 14.26s/it]Train:   1%|          | 58/5000 [14:00<19:34:14, 14.26s/it]Train:   1%|          | 59/5000 [14:15<19:33:54, 14.26s/it]Train:   1%|          | 60/5000 [14:29<19:35:44, 14.28s/it]                                                           {'loss': 2.0624773, 'token_acc': 0.57411491, 'grad_norm': 0.35430777, 'learning_rate': 4.8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.067846, 'epoch': 0.01, 'global_step/max_steps': '60/5000', 'percentage': '1.20%', 'elapsed_time': '14m 29s', 'remaining_time': '19h 53m 2s'}
+Train:   1%|          | 60/5000 [14:29<19:35:44, 14.28s/it]Train:   1%|          | 60/5000 [14:29<19:35:44, 14.28s/it]Train:   1%|          | 61/5000 [14:43<19:35:30, 14.28s/it]Train:   1%|          | 62/5000 [14:57<19:33:53, 14.26s/it]Train:   1%|▏         | 63/5000 [15:12<19:34:06, 14.27s/it]Train:   1%|▏         | 64/5000 [15:26<19:34:19, 14.27s/it]Train:   1%|▏         | 65/5000 [15:40<19:34:25, 14.28s/it]Train:   1%|▏         | 66/5000 [15:55<19:34:10, 14.28s/it]Train:   1%|▏         | 67/5000 [16:09<19:33:41, 14.28s/it]Train:   1%|▏         | 68/5000 [16:23<19:35:10, 14.30s/it]Train:   1%|▏         | 69/5000 [16:37<19:34:15, 14.29s/it]Train:   1%|▏         | 70/5000 [16:52<19:34:10, 14.29s/it]                                                           {'loss': 2.04510059, 'token_acc': 0.57712534, 'grad_norm': 0.41511905, 'learning_rate': 5.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068148, 'epoch': 0.01, 'global_step/max_steps': '70/5000', 'percentage': '1.40%', 'elapsed_time': '16m 52s', 'remaining_time': '19h 48m 11s'}
+Train:   1%|▏         | 70/5000 [16:52<19:34:10, 14.29s/it]Train:   1%|▏         | 70/5000 [16:52<19:34:10, 14.29s/it]Train:   1%|▏         | 71/5000 [17:06<19:33:22, 14.28s/it]Train:   1%|▏         | 72/5000 [17:20<19:33:16, 14.29s/it]Train:   1%|▏         | 73/5000 [17:35<19:32:23, 14.28s/it]Train:   1%|▏         | 74/5000 [17:49<19:33:21, 14.29s/it]Train:   2%|▏         | 75/5000 [18:03<19:33:14, 14.29s/it]Train:   2%|▏         | 76/5000 [18:17<19:32:54, 14.29s/it]Train:   2%|▏         | 77/5000 [18:32<19:32:43, 14.29s/it]Train:   2%|▏         | 78/5000 [18:46<19:31:52, 14.29s/it]Train:   2%|▏         | 79/5000 [19:00<19:30:45, 14.27s/it]Train:   2%|▏         | 80/5000 [19:15<19:29:44, 14.27s/it]                                                           {'loss': 2.03615875, 'token_acc': 0.57699782, 'grad_norm': 0.35204482, 'learning_rate': 6.4e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068378, 'epoch': 0.02, 'global_step/max_steps': '80/5000', 'percentage': '1.60%', 'elapsed_time': '19m 15s', 'remaining_time': '19h 43m 54s'}
+Train:   2%|▏         | 80/5000 [19:15<19:29:44, 14.27s/it]Train:   2%|▏         | 80/5000 [19:15<19:29:44, 14.27s/it]Train:   2%|▏         | 81/5000 [19:29<19:30:02, 14.27s/it]Train:   2%|▏         | 82/5000 [19:43<19:29:46, 14.27s/it]Train:   2%|▏         | 83/5000 [19:57<19:30:40, 14.29s/it]Train:   2%|▏         | 84/5000 [20:12<19:29:28, 14.27s/it]Train:   2%|▏         | 85/5000 [20:26<19:28:28, 14.26s/it]Train:   2%|▏         | 86/5000 [20:40<19:27:39, 14.26s/it]Train:   2%|▏         | 87/5000 [20:54<19:27:26, 14.26s/it]Train:   2%|▏         | 88/5000 [21:09<19:26:06, 14.24s/it]Train:   2%|▏         | 89/5000 [21:23<19:26:25, 14.25s/it]Train:   2%|▏         | 90/5000 [21:37<19:26:05, 14.25s/it]                                                           {'loss': 2.01835823, 'token_acc': 0.57656891, 'grad_norm': 0.33876544, 'learning_rate': 7.2e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068569, 'epoch': 0.02, 'global_step/max_steps': '90/5000', 'percentage': '1.80%', 'elapsed_time': '21m 37s', 'remaining_time': '19h 39m 52s'}
+Train:   2%|▏         | 90/5000 [21:37<19:26:05, 14.25s/it]Train:   2%|▏         | 90/5000 [21:37<19:26:05, 14.25s/it]Train:   2%|▏         | 91/5000 [21:51<19:25:58, 14.25s/it]Train:   2%|▏         | 92/5000 [22:06<19:25:12, 14.24s/it]Train:   2%|▏         | 93/5000 [22:20<19:25:33, 14.25s/it]Train:   2%|▏         | 94/5000 [22:34<19:24:43, 14.24s/it]Train:   2%|▏         | 95/5000 [22:48<19:23:54, 14.24s/it]Train:   2%|▏         | 96/5000 [23:03<19:24:03, 14.24s/it]Train:   2%|▏         | 97/5000 [23:17<19:23:39, 14.24s/it]Train:   2%|▏         | 98/5000 [23:31<19:23:16, 14.24s/it]Train:   2%|▏         | 99/5000 [23:45<19:23:22, 14.24s/it]Train:   2%|▏         | 100/5000 [24:00<19:24:19, 14.26s/it]                                                            {'loss': 2.02236614, 'token_acc': 0.57695617, 'grad_norm': 0.34318444, 'learning_rate': 8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068728, 'epoch': 0.02, 'global_step/max_steps': '100/5000', 'percentage': '2.00%', 'elapsed_time': '24m 0s', 'remaining_time': '19h 36m 4s'}
+Train:   2%|▏         | 100/5000 [24:00<19:24:19, 14.26s/it]Train:   2%|▏         | 100/5000 [24:00<19:24:19, 14.26s/it]Train:   2%|▏         | 101/5000 [24:14<19:23:06, 14.24s/it]Train:   2%|▏         | 102/5000 [24:28<19:23:35, 14.25s/it]Train:   2%|▏         | 103/5000 [24:42<19:23:27, 14.26s/it]Train:   2%|▏         | 104/5000 [24:57<19:22:54, 14.25s/it]Train:   2%|▏         | 105/5000 [25:11<19:22:16, 14.25s/it]Train:   2%|▏         | 106/5000 [25:25<19:21:57, 14.25s/it]Train:   2%|▏         | 107/5000 [25:39<19:21:48, 14.25s/it]Train:   2%|▏         | 108/5000 [25:54<19:22:26, 14.26s/it]Train:   2%|▏         | 109/5000 [26:08<19:22:30, 14.26s/it]Train:   2%|▏         | 110/5000 [26:22<19:22:40, 14.27s/it]                                                            {'loss': 1.9887167, 'token_acc': 0.58016937, 'grad_norm': 0.35282964, 'learning_rate': 8.8e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068855, 'epoch': 0.02, 'global_step/max_steps': '110/5000', 'percentage': '2.20%', 'elapsed_time': '26m 22s', 'remaining_time': '19h 32m 35s'}
+Train:   2%|▏         | 110/5000 [26:22<19:22:40, 14.27s/it]Train:   2%|▏         | 110/5000 [26:22<19:22:40, 14.27s/it]Train:   2%|▏         | 111/5000 [26:36<19:23:28, 14.28s/it]Train:   2%|▏         | 112/5000 [26:51<19:22:48, 14.27s/it]Train:   2%|▏         | 113/5000 [27:05<19:22:06, 14.27s/it]Train:   2%|▏         | 114/5000 [27:19<19:21:49, 14.27s/it]Train:   2%|▏         | 115/5000 [27:34<19:22:19, 14.28s/it]Train:   2%|▏         | 116/5000 [27:48<19:21:22, 14.27s/it]Train:   2%|▏         | 117/5000 [28:02<19:19:52, 14.25s/it]Train:   2%|▏         | 118/5000 [28:16<19:21:03, 14.27s/it]Train:   2%|▏         | 119/5000 [28:31<19:22:03, 14.28s/it]Train:   2%|▏         | 120/5000 [28:45<19:20:57, 14.27s/it]                                                            {'loss': 1.97573853, 'token_acc': 0.57947956, 'grad_norm': 0.3538188, 'learning_rate': 9.6e-06, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.068954, 'epoch': 0.02, 'global_step/max_steps': '120/5000', 'percentage': '2.40%', 'elapsed_time': '28m 45s', 'remaining_time': '19h 29m 24s'}
+Train:   2%|▏         | 120/5000 [28:45<19:20:57, 14.27s/it]Train:   2%|▏         | 120/5000 [28:45<19:20:57, 14.27s/it]Train:   2%|▏         | 121/5000 [28:59<19:20:42, 14.27s/it]Train:   2%|▏         | 122/5000 [29:13<19:20:52, 14.28s/it]Train:   2%|▏         | 123/5000 [29:28<19:20:39, 14.28s/it]Train:   2%|▏         | 124/5000 [29:42<19:20:54, 14.29s/it]Train:   2%|▎         | 125/5000 [29:56<19:19:29, 14.27s/it]Train:   3%|▎         | 126/5000 [30:11<19:19:15, 14.27s/it]Train:   3%|▎         | 127/5000 [30:25<19:19:15, 14.27s/it]Train:   3%|▎         | 128/5000 [30:39<19:19:45, 14.28s/it]Train:   3%|▎         | 129/5000 [30:53<19:18:50, 14.27s/it]Train:   3%|▎         | 130/5000 [31:08<19:19:20, 14.28s/it]                                                            {'loss': 1.98477173, 'token_acc': 0.5853757, 'grad_norm': 0.45706409, 'learning_rate': 1.04e-05, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.069035, 'epoch': 0.03, 'global_step/max_steps': '130/5000', 'percentage': '2.60%', 'elapsed_time': '31m 8s', 'remaining_time': '19h 26m 24s'}
+Train:   3%|▎         | 130/5000 [31:08<19:19:20, 14.28s/it]Train:   3%|▎         | 130/5000 [31:08<19:19:20, 14.28s/it]Train:   3%|▎         | 131/5000 [31:22<19:19:00, 14.28s/it]Train:   3%|▎         | 132/5000 [31:36<19:19:09, 14.29s/it]Train:   3%|▎         | 133/5000 [31:51<19:18:54, 14.29s/it]Train:   3%|▎         | 134/5000 [32:05<19:17:20, 14.27s/it]Train:   3%|▎         | 135/5000 [32:19<19:17:25, 14.27s/it]Train:   3%|▎         | 136/5000 [32:33<19:17:50, 14.28s/it]Train:   3%|▎         | 137/5000 [32:48<19:17:23, 14.28s/it]Train:   3%|▎         | 138/5000 [33:02<19:16:46, 14.28s/it]Train:   3%|▎         | 139/5000 [33:16<19:16:23, 14.27s/it]Train:   3%|▎         | 140/5000 [33:30<19:16:35, 14.28s/it]                                                            {'loss': 1.96514053, 'token_acc': 0.58748601, 'grad_norm': 0.41253319, 'learning_rate': 1.12e-05, 'memory(GiB)': 126.42, 'train_speed(iter/s)': 0.069106, 'epoch': 0.03, 'global_step/max_steps': '140/5000', 'percentage': '2.80%', 'elapsed_time': '33m 30s', 'remaining_time': '19h 23m 28s'}
+Train:   3%|▎         | 140/5000 [33:30<19:16:35, 14.28s/it]Train:   3%|▎         | 140/5000 [33:30<19:16:35, 14.28s/it]Train:   3%|▎         | 141/5000 [33:45<19:16:36, 14.28s/it]Train:   3%|▎         | 142/5000 [33:59<19:16:48, 14.29s/it]Train:   3%|▎         | 143/5000 [34:13<19:16:19, 14.28s/it]Train:   3%|▎         | 144/5000 [34:28<19:16:55, 14.29s/it]Train:   3%|▎         | 145/5000 [34:42<19:17:30, 14.30s/it]Train:   3%|▎         | 146/5000 [34:56<19:18:03, 14.31s/it]Train:   3%|▎         | 147/5000 [35:11<19:17:30, 14.31s/it]Train:   3%|▎         | 148/5000 [35:25<19:16:35, 14.30s/it]Train:   3%|▎         | 149/5000 [35:39<19:17:30, 14.32s/it]Train:   3%|▎         | 150/5000 [35:54<19:16:26, 14.31s/it]                                                            {'loss': 1.9439106, 'token_acc': 0.58958162, 'grad_norm': 0.42743185, 'learning_rate': 1.2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069158, 'epoch': 0.03, 'global_step/max_steps': '150/5000', 'percentage': '3.00%', 'elapsed_time': '35m 54s', 'remaining_time': '19h 20m 46s'}
+Train:   3%|▎         | 150/5000 [35:54<19:16:26, 14.31s/it]Train:   3%|▎         | 150/5000 [35:54<19:16:26, 14.31s/it]Train:   3%|▎         | 151/5000 [36:08<19:16:26, 14.31s/it]Train:   3%|▎         | 152/5000 [36:22<19:14:53, 14.29s/it]Train:   3%|▎         | 153/5000 [36:36<19:14:04, 14.29s/it]Train:   3%|▎         | 154/5000 [36:51<19:14:30, 14.29s/it]Train:   3%|▎         | 155/5000 [37:05<19:15:29, 14.31s/it]Train:   3%|▎         | 156/5000 [37:19<19:14:45, 14.30s/it]Train:   3%|▎         | 157/5000 [37:34<19:14:26, 14.30s/it]Train:   3%|▎         | 158/5000 [37:48<19:13:54, 14.30s/it]Train:   3%|▎         | 159/5000 [38:02<19:13:13, 14.29s/it]Train:   3%|▎         | 160/5000 [38:16<19:12:30, 14.29s/it]                                                            {'loss': 1.95916233, 'token_acc': 0.58924333, 'grad_norm': 0.41938034, 'learning_rate': 1.28e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069208, 'epoch': 0.03, 'global_step/max_steps': '160/5000', 'percentage': '3.20%', 'elapsed_time': '38m 16s', 'remaining_time': '19h 18m 2s'}
+Train:   3%|▎         | 160/5000 [38:16<19:12:30, 14.29s/it]Train:   3%|▎         | 160/5000 [38:16<19:12:30, 14.29s/it]Train:   3%|▎         | 161/5000 [38:31<19:12:44, 14.29s/it]Train:   3%|▎         | 162/5000 [38:45<19:12:23, 14.29s/it]Train:   3%|▎         | 163/5000 [38:59<19:11:54, 14.29s/it]Train:   3%|▎         | 164/5000 [39:14<19:10:22, 14.27s/it]Train:   3%|▎         | 165/5000 [39:28<19:09:39, 14.27s/it]Train:   3%|▎         | 166/5000 [39:42<19:09:13, 14.26s/it]Train:   3%|▎         | 167/5000 [39:56<19:08:45, 14.26s/it]Train:   3%|▎         | 168/5000 [40:11<19:08:57, 14.27s/it]Train:   3%|▎         | 169/5000 [40:25<19:08:27, 14.26s/it]Train:   3%|▎         | 170/5000 [40:39<19:08:03, 14.26s/it]                                                            {'loss': 1.94503136, 'token_acc': 0.58951417, 'grad_norm': 0.36481732, 'learning_rate': 1.36e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069259, 'epoch': 0.03, 'global_step/max_steps': '170/5000', 'percentage': '3.40%', 'elapsed_time': '40m 39s', 'remaining_time': '19h 15m 13s'}
+Train:   3%|▎         | 170/5000 [40:39<19:08:03, 14.26s/it]Train:   3%|▎         | 170/5000 [40:39<19:08:03, 14.26s/it]Train:   3%|▎         | 171/5000 [40:53<19:08:36, 14.27s/it]Train:   3%|▎         | 172/5000 [41:08<19:08:56, 14.28s/it]Train:   3%|▎         | 173/5000 [41:22<19:08:08, 14.27s/it]Train:   3%|▎         | 174/5000 [41:36<19:08:32, 14.28s/it]Train:   4%|▎         | 175/5000 [41:51<19:08:04, 14.28s/it]Train:   4%|▎         | 176/5000 [42:05<19:08:27, 14.28s/it]Train:   4%|▎         | 177/5000 [42:19<19:08:22, 14.29s/it]Train:   4%|▎         | 178/5000 [42:33<19:07:06, 14.27s/it]Train:   4%|▎         | 179/5000 [42:48<19:06:00, 14.26s/it]Train:   4%|▎         | 180/5000 [43:02<19:05:42, 14.26s/it]                                                            {'loss': 1.94237328, 'token_acc': 0.58948422, 'grad_norm': 0.34816912, 'learning_rate': 1.44e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069303, 'epoch': 0.04, 'global_step/max_steps': '180/5000', 'percentage': '3.60%', 'elapsed_time': '43m 2s', 'remaining_time': '19h 12m 29s'}
+Train:   4%|▎         | 180/5000 [43:02<19:05:42, 14.26s/it]Train:   4%|▎         | 180/5000 [43:02<19:05:42, 14.26s/it]Train:   4%|▎         | 181/5000 [43:16<19:04:14, 14.25s/it]Train:   4%|▎         | 182/5000 [43:30<19:03:47, 14.24s/it]Train:   4%|▎         | 183/5000 [43:45<19:03:56, 14.25s/it]Train:   4%|▎         | 184/5000 [43:59<19:03:30, 14.25s/it]Train:   4%|▎         | 185/5000 [44:13<19:03:23, 14.25s/it]Train:   4%|▎         | 186/5000 [44:27<19:03:39, 14.25s/it]Train:   4%|▎         | 187/5000 [44:42<19:03:34, 14.26s/it]Train:   4%|▍         | 188/5000 [44:56<19:03:13, 14.25s/it]Train:   4%|▍         | 189/5000 [45:10<19:02:48, 14.25s/it]Train:   4%|▍         | 190/5000 [45:24<19:02:05, 14.25s/it]                                                            {'loss': 1.92692165, 'token_acc': 0.59268639, 'grad_norm': 0.31132489, 'learning_rate': 1.52e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069349, 'epoch': 0.04, 'global_step/max_steps': '190/5000', 'percentage': '3.80%', 'elapsed_time': '45m 24s', 'remaining_time': '19h 9m 41s'}
+Train:   4%|▍         | 190/5000 [45:24<19:02:05, 14.25s/it]Train:   4%|▍         | 190/5000 [45:24<19:02:05, 14.25s/it]Train:   4%|▍         | 191/5000 [45:39<19:01:51, 14.25s/it]Train:   4%|▍         | 192/5000 [45:53<19:02:16, 14.25s/it]Train:   4%|▍         | 193/5000 [46:07<19:01:45, 14.25s/it]Train:   4%|▍         | 194/5000 [46:21<19:01:24, 14.25s/it]Train:   4%|▍         | 195/5000 [46:36<19:03:12, 14.28s/it]Train:   4%|▍         | 196/5000 [46:50<19:02:13, 14.27s/it]Train:   4%|▍         | 197/5000 [47:04<19:03:06, 14.28s/it]Train:   4%|▍         | 198/5000 [47:19<19:02:53, 14.28s/it]Train:   4%|▍         | 199/5000 [47:33<19:02:14, 14.27s/it]Train:   4%|▍         | 200/5000 [47:47<19:01:51, 14.27s/it]                                                            {'loss': 1.92388706, 'token_acc': 0.58654898, 'grad_norm': 0.29845539, 'learning_rate': 1.6e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069385, 'epoch': 0.04, 'global_step/max_steps': '200/5000', 'percentage': '4.00%', 'elapsed_time': '47m 47s', 'remaining_time': '19h 7m 0s'}
+Train:   4%|▍         | 200/5000 [47:47<19:01:51, 14.27s/it]Train:   4%|▍         | 200/5000 [47:47<19:01:51, 14.27s/it]Train:   4%|▍         | 201/5000 [48:01<19:01:04, 14.27s/it]Train:   4%|▍         | 202/5000 [48:16<19:00:41, 14.26s/it]Train:   4%|▍         | 203/5000 [48:30<19:00:30, 14.27s/it]Train:   4%|▍         | 204/5000 [48:44<19:00:22, 14.27s/it]Train:   4%|▍         | 205/5000 [48:58<19:00:31, 14.27s/it]Train:   4%|▍         | 206/5000 [49:13<19:00:20, 14.27s/it]Train:   4%|▍         | 207/5000 [49:27<19:01:44, 14.29s/it]Train:   4%|▍         | 208/5000 [49:41<19:00:04, 14.27s/it]Train:   4%|▍         | 209/5000 [49:55<18:59:12, 14.27s/it]Train:   4%|▍         | 210/5000 [50:10<18:57:52, 14.25s/it]                                                            {'loss': 1.90621529, 'token_acc': 0.59126382, 'grad_norm': 0.31904176, 'learning_rate': 1.68e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069419, 'epoch': 0.04, 'global_step/max_steps': '210/5000', 'percentage': '4.20%', 'elapsed_time': '50m 10s', 'remaining_time': '19h 4m 20s'}
+Train:   4%|▍         | 210/5000 [50:10<18:57:52, 14.25s/it]Train:   4%|▍         | 210/5000 [50:10<18:57:52, 14.25s/it]Train:   4%|▍         | 211/5000 [50:24<18:56:56, 14.24s/it]Train:   4%|▍         | 212/5000 [50:38<18:57:50, 14.26s/it]Train:   4%|▍         | 213/5000 [50:52<18:57:23, 14.26s/it]Train:   4%|▍         | 214/5000 [51:07<18:57:06, 14.26s/it]Train:   4%|▍         | 215/5000 [51:21<18:56:13, 14.25s/it]Train:   4%|▍         | 216/5000 [51:35<18:56:10, 14.25s/it]Train:   4%|▍         | 217/5000 [51:49<18:55:28, 14.24s/it]Train:   4%|▍         | 218/5000 [52:04<18:56:25, 14.26s/it]Train:   4%|▍         | 219/5000 [52:18<18:56:11, 14.26s/it]Train:   4%|▍         | 220/5000 [52:32<18:56:31, 14.27s/it]                                                            {'loss': 1.905546, 'token_acc': 0.59493986, 'grad_norm': 0.30268887, 'learning_rate': 1.76e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069451, 'epoch': 0.04, 'global_step/max_steps': '220/5000', 'percentage': '4.40%', 'elapsed_time': '52m 32s', 'remaining_time': '19h 1m 40s'}
+Train:   4%|▍         | 220/5000 [52:32<18:56:31, 14.27s/it]Train:   4%|▍         | 220/5000 [52:32<18:56:31, 14.27s/it]Train:   4%|▍         | 221/5000 [52:47<18:56:15, 14.27s/it]Train:   4%|▍         | 222/5000 [53:01<18:54:59, 14.25s/it]Train:   4%|▍         | 223/5000 [53:15<18:55:34, 14.26s/it]Train:   4%|▍         | 224/5000 [53:29<18:54:41, 14.25s/it]Train:   4%|▍         | 225/5000 [53:44<18:55:29, 14.27s/it]Train:   5%|▍         | 226/5000 [53:58<18:56:28, 14.28s/it]Train:   5%|▍         | 227/5000 [54:12<18:55:54, 14.28s/it]Train:   5%|▍         | 228/5000 [54:26<18:55:05, 14.27s/it]Train:   5%|▍         | 229/5000 [54:41<18:55:45, 14.28s/it]Train:   5%|▍         | 230/5000 [54:55<18:54:45, 14.27s/it]                                                            {'loss': 1.90415268, 'token_acc': 0.5953011, 'grad_norm': 0.28634891, 'learning_rate': 1.84e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069478, 'epoch': 0.05, 'global_step/max_steps': '230/5000', 'percentage': '4.60%', 'elapsed_time': '54m 55s', 'remaining_time': '18h 59m 5s'}
+Train:   5%|▍         | 230/5000 [54:55<18:54:45, 14.27s/it]Train:   5%|▍         | 230/5000 [54:55<18:54:45, 14.27s/it]Train:   5%|▍         | 231/5000 [55:09<18:55:04, 14.28s/it]Train:   5%|▍         | 232/5000 [55:24<18:54:00, 14.27s/it]Train:   5%|▍         | 233/5000 [55:38<18:55:00, 14.29s/it]Train:   5%|▍         | 234/5000 [55:52<18:54:08, 14.28s/it]Train:   5%|▍         | 235/5000 [56:06<18:53:32, 14.27s/it]Train:   5%|▍         | 236/5000 [56:21<18:54:27, 14.29s/it]Train:   5%|▍         | 237/5000 [56:35<18:54:35, 14.29s/it]Train:   5%|▍         | 238/5000 [56:49<18:54:26, 14.29s/it]Train:   5%|▍         | 239/5000 [57:04<18:54:27, 14.30s/it]Train:   5%|▍         | 240/5000 [57:18<18:53:38, 14.29s/it]                                                            {'loss': 1.89853706, 'token_acc': 0.5949921, 'grad_norm': 0.30982289, 'learning_rate': 1.92e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069499, 'epoch': 0.05, 'global_step/max_steps': '240/5000', 'percentage': '4.80%', 'elapsed_time': '57m 18s', 'remaining_time': '18h 56m 33s'}
+Train:   5%|▍         | 240/5000 [57:18<18:53:38, 14.29s/it]Train:   5%|▍         | 240/5000 [57:18<18:53:38, 14.29s/it]Train:   5%|▍         | 241/5000 [57:32<18:53:26, 14.29s/it]Train:   5%|▍         | 242/5000 [57:46<18:53:30, 14.29s/it]Train:   5%|▍         | 243/5000 [58:01<18:53:08, 14.29s/it]Train:   5%|▍         | 244/5000 [58:15<18:53:19, 14.30s/it]Train:   5%|▍         | 245/5000 [58:29<18:54:02, 14.31s/it]Train:   5%|▍         | 246/5000 [58:44<18:53:53, 14.31s/it]Train:   5%|▍         | 247/5000 [58:58<18:53:36, 14.31s/it]Train:   5%|▍         | 248/5000 [59:12<18:52:44, 14.30s/it]Train:   5%|▍         | 249/5000 [59:27<18:51:57, 14.30s/it]Train:   5%|▌         | 250/5000 [59:41<18:51:38, 14.29s/it]                                                            {'loss': 1.8904171, 'token_acc': 0.59155839, 'grad_norm': 0.33320111, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069516, 'epoch': 0.05, 'global_step/max_steps': '250/5000', 'percentage': '5.00%', 'elapsed_time': '59m 41s', 'remaining_time': '18h 54m 5s'}
+Train:   5%|▌         | 250/5000 [59:41<18:51:38, 14.29s/it]Train:   5%|▌         | 250/5000 [59:41<18:51:38, 14.29s/it]Train:   5%|▌         | 251/5000 [59:55<18:52:12, 14.30s/it]Train:   5%|▌         | 252/5000 [1:00:09<18:52:00, 14.31s/it]Train:   5%|▌         | 253/5000 [1:00:24<18:51:38, 14.30s/it]Train:   5%|▌         | 254/5000 [1:00:38<18:51:40, 14.31s/it]Train:   5%|▌         | 255/5000 [1:00:52<18:51:27, 14.31s/it]Train:   5%|▌         | 256/5000 [1:01:07<18:50:58, 14.30s/it]Train:   5%|▌         | 257/5000 [1:01:21<18:49:36, 14.29s/it]Train:   5%|▌         | 258/5000 [1:01:35<18:49:19, 14.29s/it]Train:   5%|▌         | 259/5000 [1:01:50<18:48:36, 14.28s/it]Train:   5%|▌         | 260/5000 [1:02:04<18:48:14, 14.28s/it]                                                              {'loss': 1.8867527, 'token_acc': 0.59453495, 'grad_norm': 0.30961829, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069533, 'epoch': 0.05, 'global_step/max_steps': '260/5000', 'percentage': '5.20%', 'elapsed_time': '1h 2m 4s', 'remaining_time': '18h 51m 36s'}
+Train:   5%|▌         | 260/5000 [1:02:04<18:48:14, 14.28s/it]Train:   5%|▌         | 260/5000 [1:02:04<18:48:14, 14.28s/it]Train:   5%|▌         | 261/5000 [1:02:18<18:48:00, 14.28s/it]Train:   5%|▌         | 262/5000 [1:02:32<18:47:52, 14.28s/it]Train:   5%|▌         | 263/5000 [1:02:47<18:47:14, 14.28s/it]Train:   5%|▌         | 264/5000 [1:03:01<18:47:21, 14.28s/it]Train:   5%|▌         | 265/5000 [1:03:15<18:46:56, 14.28s/it]Train:   5%|▌         | 266/5000 [1:03:29<18:46:44, 14.28s/it]Train:   5%|▌         | 267/5000 [1:03:44<18:48:08, 14.30s/it]Train:   5%|▌         | 268/5000 [1:03:58<18:47:22, 14.29s/it]Train:   5%|▌         | 269/5000 [1:04:12<18:46:33, 14.29s/it]Train:   5%|▌         | 270/5000 [1:04:27<18:45:06, 14.27s/it]                                                              {'loss': 1.88856049, 'token_acc': 0.59468605, 'grad_norm': 0.29619628, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069551, 'epoch': 0.05, 'global_step/max_steps': '270/5000', 'percentage': '5.40%', 'elapsed_time': '1h 4m 27s', 'remaining_time': '18h 49m 6s'}
+Train:   5%|▌         | 270/5000 [1:04:27<18:45:06, 14.27s/it]Train:   5%|▌         | 270/5000 [1:04:27<18:45:06, 14.27s/it]Train:   5%|▌         | 271/5000 [1:04:41<18:44:45, 14.27s/it]Train:   5%|▌         | 272/5000 [1:04:55<18:44:27, 14.27s/it]Train:   5%|▌         | 273/5000 [1:05:09<18:44:06, 14.27s/it]Train:   5%|▌         | 274/5000 [1:05:24<18:43:38, 14.27s/it]Train:   6%|▌         | 275/5000 [1:05:38<18:42:20, 14.25s/it]Train:   6%|▌         | 276/5000 [1:05:52<18:43:58, 14.28s/it]Train:   6%|▌         | 277/5000 [1:06:07<18:44:18, 14.28s/it]Train:   6%|▌         | 278/5000 [1:06:21<18:44:31, 14.29s/it]Train:   6%|▌         | 279/5000 [1:06:35<18:43:59, 14.29s/it]Train:   6%|▌         | 280/5000 [1:06:49<18:43:22, 14.28s/it]                                                              {'loss': 1.88822556, 'token_acc': 0.60000131, 'grad_norm': 0.32128623, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069568, 'epoch': 0.06, 'global_step/max_steps': '280/5000', 'percentage': '5.60%', 'elapsed_time': '1h 6m 49s', 'remaining_time': '18h 46m 35s'}
+Train:   6%|▌         | 280/5000 [1:06:49<18:43:22, 14.28s/it]Train:   6%|▌         | 280/5000 [1:06:49<18:43:22, 14.28s/it]Train:   6%|▌         | 281/5000 [1:07:04<18:42:19, 14.27s/it]Train:   6%|▌         | 282/5000 [1:07:18<18:42:21, 14.27s/it]Train:   6%|▌         | 283/5000 [1:07:32<18:41:51, 14.27s/it]Train:   6%|▌         | 284/5000 [1:07:46<18:41:18, 14.27s/it]Train:   6%|▌         | 285/5000 [1:08:01<18:40:34, 14.26s/it]Train:   6%|▌         | 286/5000 [1:08:15<18:39:26, 14.25s/it]Train:   6%|▌         | 287/5000 [1:08:29<18:40:06, 14.26s/it]Train:   6%|▌         | 288/5000 [1:08:43<18:38:49, 14.25s/it]Train:   6%|▌         | 289/5000 [1:08:58<18:39:09, 14.25s/it]Train:   6%|▌         | 290/5000 [1:09:12<18:39:09, 14.26s/it]                                                              {'loss': 1.8941431, 'token_acc': 0.60058296, 'grad_norm': 0.3281253, 'learning_rate': 2e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069588, 'epoch': 0.06, 'global_step/max_steps': '290/5000', 'percentage': '5.80%', 'elapsed_time': '1h 9m 12s', 'remaining_time': '18h 44m 1s'}
+Train:   6%|▌         | 290/5000 [1:09:12<18:39:09, 14.26s/it]Train:   6%|▌         | 290/5000 [1:09:12<18:39:09, 14.26s/it]Train:   6%|▌         | 291/5000 [1:09:26<18:39:28, 14.26s/it]Train:   6%|▌         | 292/5000 [1:09:41<18:40:10, 14.28s/it]Train:   6%|▌         | 293/5000 [1:09:55<18:39:49, 14.27s/it]Train:   6%|▌         | 294/5000 [1:10:09<18:39:21, 14.27s/it]Train:   6%|▌         | 295/5000 [1:10:23<18:38:41, 14.27s/it]Train:   6%|▌         | 296/5000 [1:10:38<18:37:54, 14.26s/it]Train:   6%|▌         | 297/5000 [1:10:52<18:37:37, 14.26s/it]Train:   6%|▌         | 298/5000 [1:11:06<18:37:10, 14.26s/it]Train:   6%|▌         | 299/5000 [1:11:20<18:37:07, 14.26s/it]Train:   6%|▌         | 300/5000 [1:11:35<18:36:36, 14.25s/it]                                                              {'loss': 1.86357136, 'token_acc': 0.60353009, 'grad_norm': 0.28836361, 'learning_rate': 1.999e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069606, 'epoch': 0.06, 'global_step/max_steps': '300/5000', 'percentage': '6.00%', 'elapsed_time': '1h 11m 35s', 'remaining_time': '18h 41m 29s'}
+Train:   6%|▌         | 300/5000 [1:11:35<18:36:36, 14.25s/it]Train:   6%|▌         | 300/5000 [1:11:35<18:36:36, 14.25s/it]Train:   6%|▌         | 301/5000 [1:11:49<18:37:41, 14.27s/it]Train:   6%|▌         | 302/5000 [1:12:03<18:36:54, 14.26s/it]Train:   6%|▌         | 303/5000 [1:12:17<18:37:12, 14.27s/it]Train:   6%|▌         | 304/5000 [1:12:32<18:36:51, 14.27s/it]Train:   6%|▌         | 305/5000 [1:12:46<18:36:57, 14.27s/it]Train:   6%|▌         | 306/5000 [1:13:00<18:36:17, 14.27s/it]Train:   6%|▌         | 307/5000 [1:13:15<18:36:30, 14.27s/it]Train:   6%|▌         | 308/5000 [1:13:29<18:36:37, 14.28s/it]Train:   6%|▌         | 309/5000 [1:13:43<18:35:58, 14.27s/it]Train:   6%|▌         | 310/5000 [1:13:57<18:35:27, 14.27s/it]                                                              {'loss': 1.86665096, 'token_acc': 0.59896584, 'grad_norm': 0.30325133, 'learning_rate': 1.999e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.06962, 'epoch': 0.06, 'global_step/max_steps': '310/5000', 'percentage': '6.20%', 'elapsed_time': '1h 13m 57s', 'remaining_time': '18h 38m 59s'}
+Train:   6%|▌         | 310/5000 [1:13:57<18:35:27, 14.27s/it]Train:   6%|▌         | 310/5000 [1:13:57<18:35:27, 14.27s/it]Train:   6%|▌         | 311/5000 [1:14:12<18:34:31, 14.26s/it]Train:   6%|▌         | 312/5000 [1:14:26<18:33:08, 14.25s/it]Train:   6%|▋         | 313/5000 [1:14:40<18:32:21, 14.24s/it]Train:   6%|▋         | 314/5000 [1:14:54<18:33:07, 14.25s/it]Train:   6%|▋         | 315/5000 [1:15:09<18:32:33, 14.25s/it]Train:   6%|▋         | 316/5000 [1:15:23<18:33:03, 14.26s/it]Train:   6%|▋         | 317/5000 [1:15:37<18:32:24, 14.25s/it]Train:   6%|▋         | 318/5000 [1:15:51<18:31:45, 14.25s/it]Train:   6%|▋         | 319/5000 [1:16:06<18:31:58, 14.25s/it]Train:   6%|▋         | 320/5000 [1:16:20<18:32:22, 14.26s/it]                                                              {'loss': 1.87351952, 'token_acc': 0.60346977, 'grad_norm': 0.35297585, 'learning_rate': 1.999e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069637, 'epoch': 0.06, 'global_step/max_steps': '320/5000', 'percentage': '6.40%', 'elapsed_time': '1h 16m 20s', 'remaining_time': '18h 36m 27s'}
+Train:   6%|▋         | 320/5000 [1:16:20<18:32:22, 14.26s/it]Train:   6%|▋         | 320/5000 [1:16:20<18:32:22, 14.26s/it]Train:   6%|▋         | 321/5000 [1:16:34<18:31:41, 14.26s/it]Train:   6%|▋         | 322/5000 [1:16:48<18:33:01, 14.28s/it]Train:   6%|▋         | 323/5000 [1:17:03<18:32:21, 14.27s/it]Train:   6%|▋         | 324/5000 [1:17:17<18:31:09, 14.26s/it]Train:   6%|▋         | 325/5000 [1:17:31<18:31:26, 14.26s/it]Train:   7%|▋         | 326/5000 [1:17:45<18:31:09, 14.26s/it]Train:   7%|▋         | 327/5000 [1:18:00<18:30:40, 14.26s/it]Train:   7%|▋         | 328/5000 [1:18:14<18:30:31, 14.26s/it]Train:   7%|▋         | 329/5000 [1:18:28<18:30:27, 14.26s/it]Train:   7%|▋         | 330/5000 [1:18:42<18:30:53, 14.27s/it]                                                              {'loss': 1.86195011, 'token_acc': 0.6040661, 'grad_norm': 0.29450929, 'learning_rate': 1.999e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069651, 'epoch': 0.07, 'global_step/max_steps': '330/5000', 'percentage': '6.60%', 'elapsed_time': '1h 18m 42s', 'remaining_time': '18h 33m 57s'}
+Train:   7%|▋         | 330/5000 [1:18:42<18:30:53, 14.27s/it]Train:   7%|▋         | 330/5000 [1:18:42<18:30:53, 14.27s/it]Train:   7%|▋         | 331/5000 [1:18:57<18:29:34, 14.26s/it]Train:   7%|▋         | 332/5000 [1:19:11<18:30:18, 14.27s/it]Train:   7%|▋         | 333/5000 [1:19:25<18:31:04, 14.28s/it]Train:   7%|▋         | 334/5000 [1:19:40<18:30:43, 14.28s/it]Train:   7%|▋         | 335/5000 [1:19:54<18:30:12, 14.28s/it]Train:   7%|▋         | 336/5000 [1:20:08<18:29:05, 14.27s/it]Train:   7%|▋         | 337/5000 [1:20:22<18:28:39, 14.27s/it]Train:   7%|▋         | 338/5000 [1:20:37<18:29:13, 14.28s/it]Train:   7%|▋         | 339/5000 [1:20:51<18:28:57, 14.28s/it]Train:   7%|▋         | 340/5000 [1:21:05<18:28:05, 14.27s/it]                                                              {'loss': 1.86950493, 'token_acc': 0.60786045, 'grad_norm': 0.32259548, 'learning_rate': 1.998e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069663, 'epoch': 0.07, 'global_step/max_steps': '340/5000', 'percentage': '6.80%', 'elapsed_time': '1h 21m 5s', 'remaining_time': '18h 31m 28s'}
+Train:   7%|▋         | 340/5000 [1:21:05<18:28:05, 14.27s/it]Train:   7%|▋         | 340/5000 [1:21:05<18:28:05, 14.27s/it]Train:   7%|▋         | 341/5000 [1:21:20<18:28:39, 14.28s/it]Train:   7%|▋         | 342/5000 [1:21:34<18:27:46, 14.27s/it]Train:   7%|▋         | 343/5000 [1:21:48<18:29:44, 14.30s/it]Train:   7%|▋         | 344/5000 [1:22:02<18:28:06, 14.28s/it]Train:   7%|▋         | 345/5000 [1:22:17<18:28:53, 14.29s/it]Train:   7%|▋         | 346/5000 [1:22:31<18:29:40, 14.31s/it]Train:   7%|▋         | 347/5000 [1:22:45<18:29:31, 14.31s/it]Train:   7%|▋         | 348/5000 [1:23:00<18:29:18, 14.31s/it]Train:   7%|▋         | 349/5000 [1:23:14<18:28:18, 14.30s/it]Train:   7%|▋         | 350/5000 [1:23:28<18:26:54, 14.28s/it]                                                              {'loss': 1.87842102, 'token_acc': 0.59794457, 'grad_norm': 0.28930929, 'learning_rate': 1.998e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069671, 'epoch': 0.07, 'global_step/max_steps': '350/5000', 'percentage': '7.00%', 'elapsed_time': '1h 23m 28s', 'remaining_time': '18h 29m 3s'}
+Train:   7%|▋         | 350/5000 [1:23:28<18:26:54, 14.28s/it]Train:   7%|▋         | 350/5000 [1:23:28<18:26:54, 14.28s/it]Train:   7%|▋         | 351/5000 [1:23:42<18:26:21, 14.28s/it]Train:   7%|▋         | 352/5000 [1:23:57<18:26:21, 14.28s/it]Train:   7%|▋         | 353/5000 [1:24:11<18:25:55, 14.28s/it]Train:   7%|▋         | 354/5000 [1:24:25<18:26:49, 14.29s/it]Train:   7%|▋         | 355/5000 [1:24:40<18:25:31, 14.28s/it]Train:   7%|▋         | 356/5000 [1:24:54<18:24:50, 14.27s/it]Train:   7%|▋         | 357/5000 [1:25:08<18:25:11, 14.28s/it]Train:   7%|▋         | 358/5000 [1:25:22<18:24:46, 14.28s/it]Train:   7%|▋         | 359/5000 [1:25:37<18:25:01, 14.29s/it]Train:   7%|▋         | 360/5000 [1:25:51<18:24:33, 14.28s/it]                                                              {'loss': 1.85580444, 'token_acc': 0.60061019, 'grad_norm': 0.29744047, 'learning_rate': 1.997e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069681, 'epoch': 0.07, 'global_step/max_steps': '360/5000', 'percentage': '7.20%', 'elapsed_time': '1h 25m 51s', 'remaining_time': '18h 26m 36s'}
+Train:   7%|▋         | 360/5000 [1:25:51<18:24:33, 14.28s/it]Train:   7%|▋         | 360/5000 [1:25:51<18:24:33, 14.28s/it]Train:   7%|▋         | 361/5000 [1:26:05<18:25:25, 14.30s/it]Train:   7%|▋         | 362/5000 [1:26:20<18:24:22, 14.29s/it]Train:   7%|▋         | 363/5000 [1:26:34<18:24:27, 14.29s/it]Train:   7%|▋         | 364/5000 [1:26:48<18:23:26, 14.28s/it]Train:   7%|▋         | 365/5000 [1:27:02<18:23:41, 14.29s/it]Train:   7%|▋         | 366/5000 [1:27:17<18:24:18, 14.30s/it]Train:   7%|▋         | 367/5000 [1:27:31<18:24:53, 14.31s/it]Train:   7%|▋         | 368/5000 [1:27:45<18:23:34, 14.30s/it]Train:   7%|▋         | 369/5000 [1:28:00<18:23:53, 14.30s/it]Train:   7%|▋         | 370/5000 [1:28:14<18:22:25, 14.29s/it]                                                              {'loss': 1.85854855, 'token_acc': 0.60011763, 'grad_norm': 0.32163116, 'learning_rate': 1.997e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069688, 'epoch': 0.07, 'global_step/max_steps': '370/5000', 'percentage': '7.40%', 'elapsed_time': '1h 28m 14s', 'remaining_time': '18h 24m 11s'}
+Train:   7%|▋         | 370/5000 [1:28:14<18:22:25, 14.29s/it]Train:   7%|▋         | 370/5000 [1:28:14<18:22:25, 14.29s/it]Train:   7%|▋         | 371/5000 [1:28:28<18:22:29, 14.29s/it]Train:   7%|▋         | 372/5000 [1:28:42<18:21:45, 14.28s/it]Train:   7%|▋         | 373/5000 [1:28:57<18:20:57, 14.28s/it]Train:   7%|▋         | 374/5000 [1:29:11<18:21:13, 14.28s/it]Train:   8%|▊         | 375/5000 [1:29:25<18:20:19, 14.27s/it]Train:   8%|▊         | 376/5000 [1:29:40<18:19:55, 14.27s/it]Train:   8%|▊         | 377/5000 [1:29:54<18:18:30, 14.26s/it]Train:   8%|▊         | 378/5000 [1:30:08<18:17:52, 14.25s/it]Train:   8%|▊         | 379/5000 [1:30:22<18:17:25, 14.25s/it]Train:   8%|▊         | 380/5000 [1:30:37<18:17:30, 14.25s/it]                                                              {'loss': 1.86182747, 'token_acc': 0.59940626, 'grad_norm': 0.31187212, 'learning_rate': 1.996e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.0697, 'epoch': 0.08, 'global_step/max_steps': '380/5000', 'percentage': '7.60%', 'elapsed_time': '1h 30m 37s', 'remaining_time': '18h 21m 42s'}
+Train:   8%|▊         | 380/5000 [1:30:37<18:17:30, 14.25s/it]Train:   8%|▊         | 380/5000 [1:30:37<18:17:30, 14.25s/it]Train:   8%|▊         | 381/5000 [1:30:51<18:17:52, 14.26s/it]Train:   8%|▊         | 382/5000 [1:31:05<18:17:51, 14.26s/it]Train:   8%|▊         | 383/5000 [1:31:19<18:17:39, 14.26s/it]Train:   8%|▊         | 384/5000 [1:31:34<18:17:00, 14.26s/it]Train:   8%|▊         | 385/5000 [1:31:48<18:17:50, 14.27s/it]Train:   8%|▊         | 386/5000 [1:32:02<18:18:17, 14.28s/it]Train:   8%|▊         | 387/5000 [1:32:17<18:19:45, 14.30s/it]Train:   8%|▊         | 388/5000 [1:32:31<18:19:44, 14.31s/it]Train:   8%|▊         | 389/5000 [1:32:45<18:19:38, 14.31s/it]Train:   8%|▊         | 390/5000 [1:32:59<18:18:40, 14.30s/it]                                                              {'loss': 1.86076431, 'token_acc': 0.60066565, 'grad_norm': 0.28586408, 'learning_rate': 1.996e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069706, 'epoch': 0.08, 'global_step/max_steps': '390/5000', 'percentage': '7.80%', 'elapsed_time': '1h 32m 59s', 'remaining_time': '18h 19m 18s'}
+Train:   8%|▊         | 390/5000 [1:32:59<18:18:40, 14.30s/it]Train:   8%|▊         | 390/5000 [1:32:59<18:18:40, 14.30s/it]Train:   8%|▊         | 391/5000 [1:33:14<18:17:37, 14.29s/it]Train:   8%|▊         | 392/5000 [1:33:28<18:16:41, 14.28s/it]Train:   8%|▊         | 393/5000 [1:33:42<18:17:34, 14.29s/it]Train:   8%|▊         | 394/5000 [1:33:57<18:17:27, 14.30s/it]Train:   8%|▊         | 395/5000 [1:34:11<18:16:07, 14.28s/it]Train:   8%|▊         | 396/5000 [1:34:25<18:17:15, 14.30s/it]Train:   8%|▊         | 397/5000 [1:34:39<18:16:31, 14.29s/it]Train:   8%|▊         | 398/5000 [1:34:54<18:15:27, 14.28s/it]Train:   8%|▊         | 399/5000 [1:35:08<18:14:01, 14.27s/it]Train:   8%|▊         | 400/5000 [1:35:22<18:13:32, 14.26s/it]                                                              {'loss': 1.84686394, 'token_acc': 0.60087541, 'grad_norm': 0.28154442, 'learning_rate': 1.995e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069715, 'epoch': 0.08, 'global_step/max_steps': '400/5000', 'percentage': '8.00%', 'elapsed_time': '1h 35m 22s', 'remaining_time': '18h 16m 51s'}
+Train:   8%|▊         | 400/5000 [1:35:22<18:13:32, 14.26s/it]Train:   8%|▊         | 400/5000 [1:35:22<18:13:32, 14.26s/it]Train:   8%|▊         | 401/5000 [1:35:37<18:14:01, 14.27s/it]Train:   8%|▊         | 402/5000 [1:35:51<18:14:46, 14.29s/it]Train:   8%|▊         | 403/5000 [1:36:05<18:13:21, 14.27s/it]Train:   8%|▊         | 404/5000 [1:36:19<18:12:35, 14.26s/it]Train:   8%|▊         | 405/5000 [1:36:34<18:11:34, 14.25s/it]Train:   8%|▊         | 406/5000 [1:36:48<18:11:44, 14.26s/it]Train:   8%|▊         | 407/5000 [1:37:02<18:12:10, 14.27s/it]Train:   8%|▊         | 408/5000 [1:37:16<18:11:10, 14.26s/it]Train:   8%|▊         | 409/5000 [1:37:31<18:10:59, 14.26s/it]Train:   8%|▊         | 410/5000 [1:37:45<18:10:44, 14.26s/it]                                                              {'loss': 1.83447456, 'token_acc': 0.61304684, 'grad_norm': 0.28617385, 'learning_rate': 1.994e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069724, 'epoch': 0.08, 'global_step/max_steps': '410/5000', 'percentage': '8.20%', 'elapsed_time': '1h 37m 45s', 'remaining_time': '18h 14m 23s'}
+Train:   8%|▊         | 410/5000 [1:37:45<18:10:44, 14.26s/it]Train:   8%|▊         | 410/5000 [1:37:45<18:10:44, 14.26s/it]Train:   8%|▊         | 411/5000 [1:37:59<18:10:20, 14.26s/it]Train:   8%|▊         | 412/5000 [1:38:13<18:10:23, 14.26s/it]Train:   8%|▊         | 413/5000 [1:38:28<18:09:40, 14.25s/it]Train:   8%|▊         | 414/5000 [1:38:42<18:09:14, 14.25s/it]Train:   8%|▊         | 415/5000 [1:38:56<18:10:02, 14.26s/it]Train:   8%|▊         | 416/5000 [1:39:10<18:09:33, 14.26s/it]Train:   8%|▊         | 417/5000 [1:39:25<18:09:25, 14.26s/it]Train:   8%|▊         | 418/5000 [1:39:39<18:08:13, 14.25s/it]Train:   8%|▊         | 419/5000 [1:39:53<18:09:13, 14.27s/it]Train:   8%|▊         | 420/5000 [1:40:07<18:08:25, 14.26s/it]                                                              {'loss': 1.83436584, 'token_acc': 0.6025662, 'grad_norm': 0.28418577, 'learning_rate': 1.994e-05, 'memory(GiB)': 126.44, 'train_speed(iter/s)': 0.069734, 'epoch': 0.08, 'global_step/max_steps': '420/5000', 'percentage': '8.40%', 'elapsed_time': '1h 40m 7s', 'remaining_time': '18h 11m 55s'}
+Train:   8%|▊         | 420/5000 [1:40:07<18:08:25, 14.26s/it]Train:   8%|▊         | 420/5000 [1:40:07<18:08:25, 14.26s/it]Train:   8%|▊         | 421/5000 [1:40:22<18:08:13, 14.26s/it]Train:   8%|▊         | 422/5000 [1:40:36<18:07:35, 14.25s/it]Train:   8%|▊         | 423/5000 [1:40:50<18:07:42, 14.26s/it]Train:   8%|▊         | 424/5000 [1:41:04<18:07:35, 14.26s/it]Train:   8%|▊         | 425/5000 [1:41:19<18:06:48, 14.25s/it]Train:   9%|▊         | 426/5000 [1:41:33<18:06:22, 14.25s/it]Train:   9%|▊         | 427/5000 [1:41:47<18:06:06, 14.25s/it]Train:   9%|▊         | 428/5000 [1:42:02<18:06:46, 14.26s/it]Train:   9%|▊         | 429/5000 [1:42:16<18:07:06, 14.27s/it]Train:   9%|▊         | 430/5000 [1:42:30<18:06:31, 14.27s/it]                                                              {'loss': 1.85139008, 'token_acc': 0.60365973, 'grad_norm': 0.27708828, 'learning_rate': 1.993e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069743, 'epoch': 0.09, 'global_step/max_steps': '430/5000', 'percentage': '8.60%', 'elapsed_time': '1h 42m 30s', 'remaining_time': '18h 9m 27s'}
+Train:   9%|▊         | 430/5000 [1:42:30<18:06:31, 14.27s/it]Train:   9%|▊         | 430/5000 [1:42:30<18:06:31, 14.27s/it]Train:   9%|▊         | 431/5000 [1:42:44<18:06:08, 14.26s/it]Train:   9%|▊         | 432/5000 [1:42:59<18:05:53, 14.26s/it]Train:   9%|▊         | 433/5000 [1:43:13<18:05:52, 14.27s/it]Train:   9%|▊         | 434/5000 [1:43:27<18:06:25, 14.28s/it]Train:   9%|▊         | 435/5000 [1:43:41<18:05:07, 14.26s/it]Train:   9%|▊         | 436/5000 [1:43:56<18:04:40, 14.26s/it]Train:   9%|▊         | 437/5000 [1:44:10<18:05:19, 14.27s/it]Train:   9%|▉         | 438/5000 [1:44:24<18:04:37, 14.27s/it]Train:   9%|▉         | 439/5000 [1:44:38<18:04:43, 14.27s/it]Train:   9%|▉         | 440/5000 [1:44:53<18:04:55, 14.28s/it]                                                              {'loss': 1.83720894, 'token_acc': 0.61009955, 'grad_norm': 0.27546084, 'learning_rate': 1.992e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069751, 'epoch': 0.09, 'global_step/max_steps': '440/5000', 'percentage': '8.80%', 'elapsed_time': '1h 44m 53s', 'remaining_time': '18h 7m 0s'}
+Train:   9%|▉         | 440/5000 [1:44:53<18:04:55, 14.28s/it]Train:   9%|▉         | 440/5000 [1:44:53<18:04:55, 14.28s/it]Train:   9%|▉         | 441/5000 [1:45:07<18:04:52, 14.28s/it]Train:   9%|▉         | 442/5000 [1:45:21<18:04:05, 14.27s/it]Train:   9%|▉         | 443/5000 [1:45:36<18:03:18, 14.26s/it]Train:   9%|▉         | 444/5000 [1:45:50<18:02:57, 14.26s/it]Train:   9%|▉         | 445/5000 [1:46:04<18:03:16, 14.27s/it]Train:   9%|▉         | 446/5000 [1:46:18<18:03:28, 14.28s/it]Train:   9%|▉         | 447/5000 [1:46:33<18:03:44, 14.28s/it]Train:   9%|▉         | 448/5000 [1:46:47<18:04:15, 14.29s/it]Train:   9%|▉         | 449/5000 [1:47:01<18:02:38, 14.27s/it]Train:   9%|▉         | 450/5000 [1:47:15<18:02:37, 14.28s/it]                                                              {'loss': 1.84299126, 'token_acc': 0.60926711, 'grad_norm': 0.27760604, 'learning_rate': 1.991e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069757, 'epoch': 0.09, 'global_step/max_steps': '450/5000', 'percentage': '9.00%', 'elapsed_time': '1h 47m 15s', 'remaining_time': '18h 4m 34s'}
+Train:   9%|▉         | 450/5000 [1:47:15<18:02:37, 14.28s/it]Train:   9%|▉         | 450/5000 [1:47:15<18:02:37, 14.28s/it]Train:   9%|▉         | 451/5000 [1:47:30<18:02:29, 14.28s/it]Train:   9%|▉         | 452/5000 [1:47:44<18:01:12, 14.26s/it]Train:   9%|▉         | 453/5000 [1:47:58<18:00:20, 14.26s/it]Train:   9%|▉         | 454/5000 [1:48:13<18:00:29, 14.26s/it]Train:   9%|▉         | 455/5000 [1:48:27<18:04:22, 14.32s/it]Train:   9%|▉         | 456/5000 [1:48:41<18:03:25, 14.31s/it]Train:   9%|▉         | 457/5000 [1:48:56<18:02:29, 14.30s/it]Train:   9%|▉         | 458/5000 [1:49:10<18:01:25, 14.29s/it]Train:   9%|▉         | 459/5000 [1:49:24<18:02:36, 14.30s/it]Train:   9%|▉         | 460/5000 [1:49:38<18:03:03, 14.31s/it]                                                              {'loss': 1.83453865, 'token_acc': 0.60485763, 'grad_norm': 0.27531469, 'learning_rate': 1.99e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069762, 'epoch': 0.09, 'global_step/max_steps': '460/5000', 'percentage': '9.20%', 'elapsed_time': '1h 49m 38s', 'remaining_time': '18h 2m 11s'}
+Train:   9%|▉         | 460/5000 [1:49:38<18:03:03, 14.31s/it]Train:   9%|▉         | 460/5000 [1:49:38<18:03:03, 14.31s/it]Train:   9%|▉         | 461/5000 [1:49:53<18:01:45, 14.30s/it]Train:   9%|▉         | 462/5000 [1:50:07<18:02:04, 14.31s/it]Train:   9%|▉         | 463/5000 [1:50:21<18:01:56, 14.31s/it]Train:   9%|▉         | 464/5000 [1:50:36<18:01:40, 14.31s/it]Train:   9%|▉         | 465/5000 [1:50:50<18:01:04, 14.30s/it]Train:   9%|▉         | 466/5000 [1:51:04<18:00:23, 14.30s/it]Train:   9%|▉         | 467/5000 [1:51:19<18:01:03, 14.31s/it]Train:   9%|▉         | 468/5000 [1:51:33<17:59:36, 14.29s/it]Train:   9%|▉         | 469/5000 [1:51:47<17:59:01, 14.29s/it]Train:   9%|▉         | 470/5000 [1:52:01<17:59:05, 14.29s/it]                                                              {'loss': 1.83594742, 'token_acc': 0.59933347, 'grad_norm': 0.28268778, 'learning_rate': 1.989e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069766, 'epoch': 0.09, 'global_step/max_steps': '470/5000', 'percentage': '9.40%', 'elapsed_time': '1h 52m 1s', 'remaining_time': '17h 59m 47s'}
+Train:   9%|▉         | 470/5000 [1:52:01<17:59:05, 14.29s/it]Train:   9%|▉         | 470/5000 [1:52:01<17:59:05, 14.29s/it]Train:   9%|▉         | 471/5000 [1:52:16<17:58:50, 14.29s/it]Train:   9%|▉         | 472/5000 [1:52:30<17:57:11, 14.27s/it]Train:   9%|▉         | 473/5000 [1:52:44<17:57:07, 14.28s/it]Train:   9%|▉         | 474/5000 [1:52:58<17:56:51, 14.28s/it]Train:  10%|▉         | 475/5000 [1:53:13<17:57:04, 14.28s/it]Train:  10%|▉         | 476/5000 [1:53:27<17:56:52, 14.28s/it]Train:  10%|▉         | 477/5000 [1:53:41<17:56:13, 14.28s/it]Train:  10%|▉         | 478/5000 [1:53:56<17:57:06, 14.29s/it]Train:  10%|▉         | 479/5000 [1:54:10<17:57:34, 14.30s/it]Train:  10%|▉         | 480/5000 [1:54:24<17:57:03, 14.30s/it]                                                              {'loss': 1.83839016, 'token_acc': 0.60039158, 'grad_norm': 0.26886845, 'learning_rate': 1.988e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06977, 'epoch': 0.1, 'global_step/max_steps': '480/5000', 'percentage': '9.60%', 'elapsed_time': '1h 54m 24s', 'remaining_time': '17h 57m 23s'}
+Train:  10%|▉         | 480/5000 [1:54:24<17:57:03, 14.30s/it]Train:  10%|▉         | 480/5000 [1:54:24<17:57:03, 14.30s/it]Train:  10%|▉         | 481/5000 [1:54:39<17:56:00, 14.29s/it]Train:  10%|▉         | 482/5000 [1:54:53<17:56:08, 14.29s/it]Train:  10%|▉         | 483/5000 [1:55:07<17:56:38, 14.30s/it]Train:  10%|▉         | 484/5000 [1:55:21<17:56:15, 14.30s/it]Train:  10%|▉         | 485/5000 [1:55:36<17:55:33, 14.29s/it]Train:  10%|▉         | 486/5000 [1:55:50<17:54:55, 14.29s/it]Train:  10%|▉         | 487/5000 [1:56:04<17:55:14, 14.30s/it]Train:  10%|▉         | 488/5000 [1:56:19<17:54:20, 14.29s/it]Train:  10%|▉         | 489/5000 [1:56:33<17:54:16, 14.29s/it]Train:  10%|▉         | 490/5000 [1:56:47<17:53:52, 14.29s/it]                                                              {'loss': 1.83234673, 'token_acc': 0.60172976, 'grad_norm': 0.27367273, 'learning_rate': 1.987e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069775, 'epoch': 0.1, 'global_step/max_steps': '490/5000', 'percentage': '9.80%', 'elapsed_time': '1h 56m 47s', 'remaining_time': '17h 54m 59s'}
+Train:  10%|▉         | 490/5000 [1:56:47<17:53:52, 14.29s/it]Train:  10%|▉         | 490/5000 [1:56:47<17:53:52, 14.29s/it]Train:  10%|▉         | 491/5000 [1:57:01<17:54:35, 14.30s/it]Train:  10%|▉         | 492/5000 [1:57:16<17:53:46, 14.29s/it]Train:  10%|▉         | 493/5000 [1:57:30<17:53:14, 14.29s/it]Train:  10%|▉         | 494/5000 [1:57:44<17:53:56, 14.30s/it]Train:  10%|▉         | 495/5000 [1:57:59<17:53:56, 14.30s/it]Train:  10%|▉         | 496/5000 [1:58:13<17:53:44, 14.30s/it]Train:  10%|▉         | 497/5000 [1:58:27<17:53:56, 14.31s/it]Train:  10%|▉         | 498/5000 [1:58:42<17:54:08, 14.32s/it]Train:  10%|▉         | 499/5000 [1:58:56<17:53:55, 14.32s/it]Train:  10%|█         | 500/5000 [1:59:10<17:53:41, 14.32s/it]                                                              {'loss': 1.81960526, 'token_acc': 0.60566657, 'grad_norm': 0.26596782, 'learning_rate': 1.986e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069777, 'epoch': 0.1, 'global_step/max_steps': '500/5000', 'percentage': '10.00%', 'elapsed_time': '1h 59m 10s', 'remaining_time': '17h 52m 36s'}
+Train:  10%|█         | 500/5000 [1:59:10<17:53:41, 14.32s/it]Train:  10%|█         | 500/5000 [1:59:10<17:53:41, 14.32s/it]Train:  10%|█         | 501/5000 [1:59:25<17:53:29, 14.32s/it]Train:  10%|█         | 502/5000 [1:59:39<17:53:26, 14.32s/it]Train:  10%|█         | 503/5000 [1:59:53<17:52:37, 14.31s/it]Train:  10%|█         | 504/5000 [2:00:07<17:51:32, 14.30s/it]Train:  10%|█         | 505/5000 [2:00:22<17:51:12, 14.30s/it]Train:  10%|█         | 506/5000 [2:00:36<17:50:33, 14.29s/it]Train:  10%|█         | 507/5000 [2:00:50<17:50:24, 14.29s/it]Train:  10%|█         | 508/5000 [2:01:05<17:50:03, 14.29s/it]Train:  10%|█         | 509/5000 [2:01:19<17:49:17, 14.29s/it]Train:  10%|█         | 510/5000 [2:01:33<17:48:58, 14.28s/it]                                                              {'loss': 1.83114777, 'token_acc': 0.59784336, 'grad_norm': 0.27380797, 'learning_rate': 1.985e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069781, 'epoch': 0.1, 'global_step/max_steps': '510/5000', 'percentage': '10.20%', 'elapsed_time': '2h 1m 33s', 'remaining_time': '17h 50m 13s'}
+Train:  10%|█         | 510/5000 [2:01:33<17:48:58, 14.28s/it]Train:  10%|█         | 510/5000 [2:01:33<17:48:58, 14.28s/it]Train:  10%|█         | 511/5000 [2:01:47<17:48:57, 14.29s/it]Train:  10%|█         | 512/5000 [2:02:02<17:47:52, 14.28s/it]Train:  10%|█         | 513/5000 [2:02:16<17:47:44, 14.28s/it]Train:  10%|█         | 514/5000 [2:02:30<17:46:56, 14.27s/it]Train:  10%|█         | 515/5000 [2:02:45<17:47:04, 14.28s/it]Train:  10%|█         | 516/5000 [2:02:59<17:47:01, 14.28s/it]Train:  10%|█         | 517/5000 [2:03:13<17:48:09, 14.30s/it]Train:  10%|█         | 518/5000 [2:03:27<17:47:47, 14.29s/it]Train:  10%|█         | 519/5000 [2:03:42<17:47:59, 14.30s/it]Train:  10%|█         | 520/5000 [2:03:56<17:47:47, 14.30s/it]                                                              {'loss': 1.83192215, 'token_acc': 0.60483506, 'grad_norm': 0.29300642, 'learning_rate': 1.984e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069784, 'epoch': 0.1, 'global_step/max_steps': '520/5000', 'percentage': '10.40%', 'elapsed_time': '2h 3m 56s', 'remaining_time': '17h 47m 49s'}
+Train:  10%|█         | 520/5000 [2:03:56<17:47:47, 14.30s/it]Train:  10%|█         | 520/5000 [2:03:56<17:47:47, 14.30s/it]Train:  10%|█         | 521/5000 [2:04:10<17:47:14, 14.30s/it]Train:  10%|█         | 522/5000 [2:04:25<17:46:21, 14.29s/it]Train:  10%|█         | 523/5000 [2:04:39<17:45:32, 14.28s/it]Train:  10%|█         | 524/5000 [2:04:53<17:44:45, 14.27s/it]Train:  10%|█         | 525/5000 [2:05:07<17:44:07, 14.27s/it]Train:  11%|█         | 526/5000 [2:05:22<17:44:08, 14.27s/it]Train:  11%|█         | 527/5000 [2:05:36<17:44:39, 14.28s/it]Train:  11%|█         | 528/5000 [2:05:50<17:43:59, 14.28s/it]Train:  11%|█         | 529/5000 [2:06:05<17:44:02, 14.28s/it]Train:  11%|█         | 530/5000 [2:06:19<17:42:07, 14.26s/it]                                                              {'loss': 1.83574581, 'token_acc': 0.60336655, 'grad_norm': 0.29365262, 'learning_rate': 1.983e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06979, 'epoch': 0.11, 'global_step/max_steps': '530/5000', 'percentage': '10.60%', 'elapsed_time': '2h 6m 19s', 'remaining_time': '17h 45m 23s'}
+Train:  11%|█         | 530/5000 [2:06:19<17:42:07, 14.26s/it]Train:  11%|█         | 530/5000 [2:06:19<17:42:07, 14.26s/it]Train:  11%|█         | 531/5000 [2:06:33<17:42:03, 14.26s/it]Train:  11%|█         | 532/5000 [2:06:47<17:42:29, 14.27s/it]Train:  11%|█         | 533/5000 [2:07:02<17:43:03, 14.28s/it]Train:  11%|█         | 534/5000 [2:07:16<17:43:07, 14.28s/it]Train:  11%|█         | 535/5000 [2:07:30<17:42:09, 14.27s/it]Train:  11%|█         | 536/5000 [2:07:44<17:42:04, 14.28s/it]Train:  11%|█         | 537/5000 [2:07:59<17:40:39, 14.26s/it]Train:  11%|█         | 538/5000 [2:08:13<17:41:03, 14.27s/it]Train:  11%|█         | 539/5000 [2:08:27<17:40:34, 14.26s/it]Train:  11%|█         | 540/5000 [2:08:41<17:40:05, 14.26s/it]                                                              {'loss': 1.82404842, 'token_acc': 0.61051699, 'grad_norm': 0.27243608, 'learning_rate': 1.982e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069795, 'epoch': 0.11, 'global_step/max_steps': '540/5000', 'percentage': '10.80%', 'elapsed_time': '2h 8m 41s', 'remaining_time': '17h 42m 57s'}
+Train:  11%|█         | 540/5000 [2:08:41<17:40:05, 14.26s/it]Train:  11%|█         | 540/5000 [2:08:41<17:40:05, 14.26s/it]Train:  11%|█         | 541/5000 [2:08:56<17:40:26, 14.27s/it]Train:  11%|█         | 542/5000 [2:09:10<17:41:34, 14.29s/it]Train:  11%|█         | 543/5000 [2:09:24<17:41:07, 14.28s/it]Train:  11%|█         | 544/5000 [2:09:39<17:40:19, 14.28s/it]Train:  11%|█         | 545/5000 [2:09:53<17:40:15, 14.28s/it]Train:  11%|█         | 546/5000 [2:10:07<17:41:20, 14.30s/it]Train:  11%|█         | 547/5000 [2:10:22<17:40:36, 14.29s/it]Train:  11%|█         | 548/5000 [2:10:36<17:40:36, 14.29s/it]Train:  11%|█         | 549/5000 [2:10:50<17:39:50, 14.29s/it]Train:  11%|█         | 550/5000 [2:11:04<17:39:46, 14.29s/it]                                                              {'loss': 1.82485981, 'token_acc': 0.60724357, 'grad_norm': 0.27438956, 'learning_rate': 1.98e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069799, 'epoch': 0.11, 'global_step/max_steps': '550/5000', 'percentage': '11.00%', 'elapsed_time': '2h 11m 4s', 'remaining_time': '17h 40m 34s'}
+Train:  11%|█         | 550/5000 [2:11:04<17:39:46, 14.29s/it]Train:  11%|█         | 550/5000 [2:11:04<17:39:46, 14.29s/it]Train:  11%|█         | 551/5000 [2:11:19<17:40:13, 14.30s/it]Train:  11%|█         | 552/5000 [2:11:33<17:39:52, 14.30s/it]Train:  11%|█         | 553/5000 [2:11:47<17:39:21, 14.29s/it]Train:  11%|█         | 554/5000 [2:12:02<17:39:24, 14.30s/it]Train:  11%|█         | 555/5000 [2:12:16<17:39:00, 14.29s/it]Train:  11%|█         | 556/5000 [2:12:30<17:38:30, 14.29s/it]Train:  11%|█         | 557/5000 [2:12:44<17:37:14, 14.28s/it]Train:  11%|█         | 558/5000 [2:12:59<17:37:51, 14.29s/it]Train:  11%|█         | 559/5000 [2:13:13<17:38:21, 14.30s/it]Train:  11%|█         | 560/5000 [2:13:27<17:37:49, 14.29s/it]                                                              {'loss': 1.80996456, 'token_acc': 0.61021513, 'grad_norm': 0.25922588, 'learning_rate': 1.979e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069801, 'epoch': 0.11, 'global_step/max_steps': '560/5000', 'percentage': '11.20%', 'elapsed_time': '2h 13m 27s', 'remaining_time': '17h 38m 10s'}
+Train:  11%|█         | 560/5000 [2:13:27<17:37:49, 14.29s/it]Train:  11%|█         | 560/5000 [2:13:27<17:37:49, 14.29s/it]Train:  11%|█         | 561/5000 [2:13:42<17:37:23, 14.29s/it]Train:  11%|█         | 562/5000 [2:13:56<17:36:57, 14.29s/it]Train:  11%|█▏        | 563/5000 [2:14:10<17:36:13, 14.28s/it]Train:  11%|█▏        | 564/5000 [2:14:24<17:35:38, 14.28s/it]Train:  11%|█▏        | 565/5000 [2:14:39<17:34:49, 14.27s/it]Train:  11%|█▏        | 566/5000 [2:14:53<17:34:47, 14.27s/it]Train:  11%|█▏        | 567/5000 [2:15:07<17:34:08, 14.27s/it]Train:  11%|█▏        | 568/5000 [2:15:21<17:34:09, 14.27s/it]Train:  11%|█▏        | 569/5000 [2:15:36<17:34:54, 14.28s/it]Train:  11%|█▏        | 570/5000 [2:15:50<17:33:50, 14.27s/it]                                                              {'loss': 1.80984459, 'token_acc': 0.606045, 'grad_norm': 0.28738451, 'learning_rate': 1.978e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069806, 'epoch': 0.11, 'global_step/max_steps': '570/5000', 'percentage': '11.40%', 'elapsed_time': '2h 15m 50s', 'remaining_time': '17h 35m 45s'}
+Train:  11%|█▏        | 570/5000 [2:15:50<17:33:50, 14.27s/it]Train:  11%|█▏        | 570/5000 [2:15:50<17:33:50, 14.27s/it]Train:  11%|█▏        | 571/5000 [2:16:04<17:33:13, 14.27s/it]Train:  11%|█▏        | 572/5000 [2:16:19<17:34:11, 14.28s/it]Train:  11%|█▏        | 573/5000 [2:16:33<17:33:24, 14.28s/it]Train:  11%|█▏        | 574/5000 [2:16:47<17:33:11, 14.28s/it]Train:  12%|█▏        | 575/5000 [2:17:01<17:32:31, 14.27s/it]Train:  12%|█▏        | 576/5000 [2:17:16<17:32:31, 14.27s/it]Train:  12%|█▏        | 577/5000 [2:17:30<17:32:10, 14.27s/it]Train:  12%|█▏        | 578/5000 [2:17:44<17:32:18, 14.28s/it]Train:  12%|█▏        | 579/5000 [2:17:59<17:32:07, 14.28s/it]Train:  12%|█▏        | 580/5000 [2:18:13<17:31:32, 14.27s/it]                                                              {'loss': 1.81860847, 'token_acc': 0.61123495, 'grad_norm': 0.27744946, 'learning_rate': 1.976e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06981, 'epoch': 0.12, 'global_step/max_steps': '580/5000', 'percentage': '11.60%', 'elapsed_time': '2h 18m 13s', 'remaining_time': '17h 33m 20s'}
+Train:  12%|█▏        | 580/5000 [2:18:13<17:31:32, 14.27s/it]Train:  12%|█▏        | 580/5000 [2:18:13<17:31:32, 14.27s/it]Train:  12%|█▏        | 581/5000 [2:18:27<17:31:04, 14.27s/it]Train:  12%|█▏        | 582/5000 [2:18:41<17:30:35, 14.27s/it]Train:  12%|█▏        | 583/5000 [2:18:56<17:30:38, 14.27s/it]Train:  12%|█▏        | 584/5000 [2:19:10<17:30:17, 14.27s/it]Train:  12%|█▏        | 585/5000 [2:19:24<17:29:20, 14.26s/it]Train:  12%|█▏        | 586/5000 [2:19:38<17:28:49, 14.26s/it]Train:  12%|█▏        | 587/5000 [2:19:53<17:29:23, 14.27s/it]Train:  12%|█▏        | 588/5000 [2:20:07<17:29:29, 14.27s/it]Train:  12%|█▏        | 589/5000 [2:20:21<17:30:01, 14.28s/it]Train:  12%|█▏        | 590/5000 [2:20:36<17:29:14, 14.28s/it]                                                              {'loss': 1.80693207, 'token_acc': 0.6066663, 'grad_norm': 0.28573743, 'learning_rate': 1.975e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069815, 'epoch': 0.12, 'global_step/max_steps': '590/5000', 'percentage': '11.80%', 'elapsed_time': '2h 20m 36s', 'remaining_time': '17h 30m 55s'}
+Train:  12%|█▏        | 590/5000 [2:20:36<17:29:14, 14.28s/it]Train:  12%|█▏        | 590/5000 [2:20:36<17:29:14, 14.28s/it]Train:  12%|█▏        | 591/5000 [2:20:50<17:28:37, 14.27s/it]Train:  12%|█▏        | 592/5000 [2:21:04<17:27:51, 14.26s/it]Train:  12%|█▏        | 593/5000 [2:21:18<17:28:26, 14.27s/it]Train:  12%|█▏        | 594/5000 [2:21:33<17:28:46, 14.28s/it]Train:  12%|█▏        | 595/5000 [2:21:47<17:28:15, 14.28s/it]Train:  12%|█▏        | 596/5000 [2:22:01<17:27:44, 14.27s/it]Train:  12%|█▏        | 597/5000 [2:22:15<17:27:08, 14.27s/it]Train:  12%|█▏        | 598/5000 [2:22:30<17:27:00, 14.27s/it]Train:  12%|█▏        | 599/5000 [2:22:44<17:26:49, 14.27s/it]Train:  12%|█▏        | 600/5000 [2:22:58<17:26:50, 14.28s/it]                                                              {'loss': 1.80462494, 'token_acc': 0.60552235, 'grad_norm': 0.25877085, 'learning_rate': 1.973e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069819, 'epoch': 0.12, 'global_step/max_steps': '600/5000', 'percentage': '12.00%', 'elapsed_time': '2h 22m 58s', 'remaining_time': '17h 28m 30s'}
+Train:  12%|█▏        | 600/5000 [2:22:58<17:26:50, 14.28s/it]Train:  12%|█▏        | 600/5000 [2:22:58<17:26:50, 14.28s/it]Train:  12%|█▏        | 601/5000 [2:23:13<17:26:56, 14.28s/it]Train:  12%|█▏        | 602/5000 [2:23:27<17:25:59, 14.27s/it]Train:  12%|█▏        | 603/5000 [2:23:41<17:26:13, 14.28s/it]Train:  12%|█▏        | 604/5000 [2:23:55<17:26:15, 14.28s/it]Train:  12%|█▏        | 605/5000 [2:24:10<17:25:33, 14.27s/it]Train:  12%|█▏        | 606/5000 [2:24:24<17:25:28, 14.28s/it]Train:  12%|█▏        | 607/5000 [2:24:38<17:25:14, 14.28s/it]Train:  12%|█▏        | 608/5000 [2:24:52<17:25:51, 14.29s/it]Train:  12%|█▏        | 609/5000 [2:25:07<17:26:00, 14.29s/it]Train:  12%|█▏        | 610/5000 [2:25:21<17:26:28, 14.30s/it]                                                              {'loss': 1.81808891, 'token_acc': 0.60900835, 'grad_norm': 0.26022747, 'learning_rate': 1.972e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069822, 'epoch': 0.12, 'global_step/max_steps': '610/5000', 'percentage': '12.20%', 'elapsed_time': '2h 25m 21s', 'remaining_time': '17h 26m 7s'}
+Train:  12%|█▏        | 610/5000 [2:25:21<17:26:28, 14.30s/it]Train:  12%|█▏        | 610/5000 [2:25:21<17:26:28, 14.30s/it]Train:  12%|█▏        | 611/5000 [2:25:35<17:26:02, 14.30s/it]Train:  12%|█▏        | 612/5000 [2:25:50<17:25:13, 14.29s/it]Train:  12%|█▏        | 613/5000 [2:26:04<17:24:54, 14.29s/it]Train:  12%|█▏        | 614/5000 [2:26:18<17:25:01, 14.30s/it]Train:  12%|█▏        | 615/5000 [2:26:33<17:24:32, 14.29s/it]Train:  12%|█▏        | 616/5000 [2:26:47<17:25:06, 14.30s/it]Train:  12%|█▏        | 617/5000 [2:27:01<17:24:27, 14.30s/it]Train:  12%|█▏        | 618/5000 [2:27:15<17:23:56, 14.29s/it]Train:  12%|█▏        | 619/5000 [2:27:30<17:23:22, 14.29s/it]Train:  12%|█▏        | 620/5000 [2:27:44<17:23:49, 14.30s/it]                                                              {'loss': 1.80496483, 'token_acc': 0.61133276, 'grad_norm': 0.28799546, 'learning_rate': 1.97e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069824, 'epoch': 0.12, 'global_step/max_steps': '620/5000', 'percentage': '12.40%', 'elapsed_time': '2h 27m 44s', 'remaining_time': '17h 23m 43s'}
+Train:  12%|█▏        | 620/5000 [2:27:44<17:23:49, 14.30s/it]Train:  12%|█▏        | 620/5000 [2:27:44<17:23:49, 14.30s/it]Train:  12%|█▏        | 621/5000 [2:27:58<17:23:45, 14.30s/it]Train:  12%|█▏        | 622/5000 [2:28:13<17:24:06, 14.31s/it]Train:  12%|█▏        | 623/5000 [2:28:27<17:23:46, 14.31s/it]Train:  12%|█▏        | 624/5000 [2:28:41<17:24:04, 14.32s/it]Train:  12%|█▎        | 625/5000 [2:28:56<17:22:26, 14.30s/it]Train:  13%|█▎        | 626/5000 [2:29:10<17:21:55, 14.29s/it]Train:  13%|█▎        | 627/5000 [2:29:24<17:21:48, 14.29s/it]Train:  13%|█▎        | 628/5000 [2:29:38<17:21:04, 14.29s/it]Train:  13%|█▎        | 629/5000 [2:29:53<17:20:29, 14.28s/it]Train:  13%|█▎        | 630/5000 [2:30:07<17:19:42, 14.28s/it]                                                              {'loss': 1.81401138, 'token_acc': 0.60446631, 'grad_norm': 0.27636328, 'learning_rate': 1.969e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069826, 'epoch': 0.13, 'global_step/max_steps': '630/5000', 'percentage': '12.60%', 'elapsed_time': '2h 30m 7s', 'remaining_time': '17h 21m 20s'}
+Train:  13%|█▎        | 630/5000 [2:30:07<17:19:42, 14.28s/it]Train:  13%|█▎        | 630/5000 [2:30:07<17:19:42, 14.28s/it]Train:  13%|█▎        | 631/5000 [2:30:21<17:19:04, 14.27s/it]Train:  13%|█▎        | 632/5000 [2:30:36<17:19:00, 14.27s/it]Train:  13%|█▎        | 633/5000 [2:30:50<17:18:34, 14.27s/it]Train:  13%|█▎        | 634/5000 [2:31:04<17:18:26, 14.27s/it]Train:  13%|█▎        | 635/5000 [2:31:18<17:18:56, 14.28s/it]Train:  13%|█▎        | 636/5000 [2:31:33<17:19:04, 14.29s/it]Train:  13%|█▎        | 637/5000 [2:31:47<17:18:53, 14.29s/it]Train:  13%|█▎        | 638/5000 [2:32:01<17:18:17, 14.28s/it]Train:  13%|█▎        | 639/5000 [2:32:15<17:17:53, 14.28s/it]Train:  13%|█▎        | 640/5000 [2:32:30<17:16:56, 14.27s/it]                                                              {'loss': 1.81558914, 'token_acc': 0.60535888, 'grad_norm': 0.26903766, 'learning_rate': 1.967e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06983, 'epoch': 0.13, 'global_step/max_steps': '640/5000', 'percentage': '12.80%', 'elapsed_time': '2h 32m 30s', 'remaining_time': '17h 18m 55s'}
+Train:  13%|█▎        | 640/5000 [2:32:30<17:16:56, 14.27s/it]Train:  13%|█▎        | 640/5000 [2:32:30<17:16:56, 14.27s/it]Train:  13%|█▎        | 641/5000 [2:32:44<17:16:41, 14.27s/it]Train:  13%|█▎        | 642/5000 [2:32:58<17:16:13, 14.27s/it]Train:  13%|█▎        | 643/5000 [2:33:13<17:16:48, 14.28s/it]Train:  13%|█▎        | 644/5000 [2:33:27<17:16:14, 14.27s/it]Train:  13%|█▎        | 645/5000 [2:33:41<17:16:21, 14.28s/it]Train:  13%|█▎        | 646/5000 [2:33:55<17:16:22, 14.28s/it]Train:  13%|█▎        | 647/5000 [2:34:10<17:16:34, 14.29s/it]Train:  13%|█▎        | 648/5000 [2:34:24<17:15:58, 14.28s/it]Train:  13%|█▎        | 649/5000 [2:34:38<17:15:29, 14.28s/it]Train:  13%|█▎        | 650/5000 [2:34:52<17:14:35, 14.27s/it]                                                              {'loss': 1.80640278, 'token_acc': 0.60911242, 'grad_norm': 0.27597925, 'learning_rate': 1.965e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069833, 'epoch': 0.13, 'global_step/max_steps': '650/5000', 'percentage': '13.00%', 'elapsed_time': '2h 34m 52s', 'remaining_time': '17h 16m 31s'}
+Train:  13%|█▎        | 650/5000 [2:34:52<17:14:35, 14.27s/it]Train:  13%|█▎        | 650/5000 [2:34:53<17:14:35, 14.27s/it]Train:  13%|█▎        | 651/5000 [2:35:07<17:13:08, 14.25s/it]Train:  13%|█▎        | 652/5000 [2:35:21<17:12:52, 14.25s/it]Train:  13%|█▎        | 653/5000 [2:35:35<17:13:09, 14.26s/it]Train:  13%|█▎        | 654/5000 [2:35:50<17:13:14, 14.26s/it]Train:  13%|█▎        | 655/5000 [2:36:04<17:13:00, 14.26s/it]Train:  13%|█▎        | 656/5000 [2:36:18<17:13:01, 14.27s/it]Train:  13%|█▎        | 657/5000 [2:36:32<17:12:29, 14.26s/it]Train:  13%|█▎        | 658/5000 [2:36:47<17:11:42, 14.26s/it]Train:  13%|█▎        | 659/5000 [2:37:01<17:10:47, 14.25s/it]Train:  13%|█▎        | 660/5000 [2:37:15<17:10:06, 14.24s/it]                                                              {'loss': 1.81658897, 'token_acc': 0.60638502, 'grad_norm': 0.26028413, 'learning_rate': 1.963e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069838, 'epoch': 0.13, 'global_step/max_steps': '660/5000', 'percentage': '13.20%', 'elapsed_time': '2h 37m 15s', 'remaining_time': '17h 14m 5s'}
+Train:  13%|█▎        | 660/5000 [2:37:15<17:10:06, 14.24s/it]Train:  13%|█▎        | 660/5000 [2:37:15<17:10:06, 14.24s/it]Train:  13%|█▎        | 661/5000 [2:37:29<17:09:27, 14.24s/it]Train:  13%|█▎        | 662/5000 [2:37:43<17:09:30, 14.24s/it]Train:  13%|█▎        | 663/5000 [2:37:58<17:09:01, 14.24s/it]Train:  13%|█▎        | 664/5000 [2:38:12<17:08:54, 14.24s/it]Train:  13%|█▎        | 665/5000 [2:38:26<17:08:52, 14.24s/it]Train:  13%|█▎        | 666/5000 [2:38:40<17:08:12, 14.23s/it]Train:  13%|█▎        | 667/5000 [2:38:55<17:08:19, 14.24s/it]Train:  13%|█▎        | 668/5000 [2:39:09<17:09:18, 14.26s/it]Train:  13%|█▎        | 669/5000 [2:39:23<17:07:40, 14.24s/it]Train:  13%|█▎        | 670/5000 [2:39:37<17:08:13, 14.25s/it]                                                              {'loss': 1.79841461, 'token_acc': 0.60439564, 'grad_norm': 0.26328433, 'learning_rate': 1.962e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069844, 'epoch': 0.13, 'global_step/max_steps': '670/5000', 'percentage': '13.40%', 'elapsed_time': '2h 39m 37s', 'remaining_time': '17h 11m 39s'}
+Train:  13%|█▎        | 670/5000 [2:39:37<17:08:13, 14.25s/it]Train:  13%|█▎        | 670/5000 [2:39:37<17:08:13, 14.25s/it]Train:  13%|█▎        | 671/5000 [2:39:52<17:08:31, 14.26s/it]Train:  13%|█▎        | 672/5000 [2:40:06<17:08:00, 14.25s/it]Train:  13%|█▎        | 673/5000 [2:40:20<17:07:46, 14.25s/it]Train:  13%|█▎        | 674/5000 [2:40:34<17:08:08, 14.26s/it]Train:  14%|█▎        | 675/5000 [2:40:49<17:07:42, 14.26s/it]Train:  14%|█▎        | 676/5000 [2:41:03<17:07:27, 14.26s/it]Train:  14%|█▎        | 677/5000 [2:41:17<17:07:24, 14.26s/it]Train:  14%|█▎        | 678/5000 [2:41:32<17:07:39, 14.27s/it]Train:  14%|█▎        | 679/5000 [2:41:46<17:07:54, 14.27s/it]Train:  14%|█▎        | 680/5000 [2:42:00<17:07:16, 14.27s/it]                                                              {'loss': 1.80759945, 'token_acc': 0.60697634, 'grad_norm': 0.26927221, 'learning_rate': 1.96e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069847, 'epoch': 0.14, 'global_step/max_steps': '680/5000', 'percentage': '13.60%', 'elapsed_time': '2h 42m 0s', 'remaining_time': '17h 9m 14s'}
+Train:  14%|█▎        | 680/5000 [2:42:00<17:07:16, 14.27s/it]Train:  14%|█▎        | 680/5000 [2:42:00<17:07:16, 14.27s/it]Train:  14%|█▎        | 681/5000 [2:42:14<17:07:27, 14.27s/it]Train:  14%|█▎        | 682/5000 [2:42:29<17:08:40, 14.29s/it]Train:  14%|█▎        | 683/5000 [2:42:43<17:08:57, 14.30s/it]Train:  14%|█▎        | 684/5000 [2:42:57<17:07:29, 14.28s/it]Train:  14%|█▎        | 685/5000 [2:43:12<17:07:10, 14.28s/it]Train:  14%|█▎        | 686/5000 [2:43:26<17:07:27, 14.29s/it]Train:  14%|█▎        | 687/5000 [2:43:40<17:06:48, 14.28s/it]Train:  14%|█▍        | 688/5000 [2:43:54<17:06:18, 14.28s/it]Train:  14%|█▍        | 689/5000 [2:44:09<17:06:26, 14.29s/it]Train:  14%|█▍        | 690/5000 [2:44:23<17:05:51, 14.28s/it]                                                              {'loss': 1.79511185, 'token_acc': 0.60908042, 'grad_norm': 0.27707371, 'learning_rate': 1.958e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069849, 'epoch': 0.14, 'global_step/max_steps': '690/5000', 'percentage': '13.80%', 'elapsed_time': '2h 44m 23s', 'remaining_time': '17h 6m 50s'}
+Train:  14%|█▍        | 690/5000 [2:44:23<17:05:51, 14.28s/it]Train:  14%|█▍        | 690/5000 [2:44:23<17:05:51, 14.28s/it]Train:  14%|█▍        | 691/5000 [2:44:37<17:05:36, 14.28s/it]Train:  14%|█▍        | 692/5000 [2:44:52<17:06:07, 14.29s/it]Train:  14%|█▍        | 693/5000 [2:45:06<17:05:32, 14.29s/it]Train:  14%|█▍        | 694/5000 [2:45:20<17:04:52, 14.28s/it]Train:  14%|█▍        | 695/5000 [2:45:34<17:04:41, 14.28s/it]Train:  14%|█▍        | 696/5000 [2:45:49<17:04:29, 14.28s/it]Train:  14%|█▍        | 697/5000 [2:46:03<17:04:06, 14.28s/it]Train:  14%|█▍        | 698/5000 [2:46:17<17:04:09, 14.28s/it]Train:  14%|█▍        | 699/5000 [2:46:32<17:04:13, 14.29s/it]Train:  14%|█▍        | 700/5000 [2:46:46<17:04:14, 14.29s/it]                                                              {'loss': 1.80035152, 'token_acc': 0.61637132, 'grad_norm': 0.47859827, 'learning_rate': 1.956e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069851, 'epoch': 0.14, 'global_step/max_steps': '700/5000', 'percentage': '14.00%', 'elapsed_time': '2h 46m 46s', 'remaining_time': '17h 4m 27s'}
+Train:  14%|█▍        | 700/5000 [2:46:46<17:04:14, 14.29s/it]Train:  14%|█▍        | 700/5000 [2:46:46<17:04:14, 14.29s/it]Train:  14%|█▍        | 701/5000 [2:47:00<17:05:06, 14.31s/it]Train:  14%|█▍        | 702/5000 [2:47:14<17:05:08, 14.31s/it]Train:  14%|█▍        | 703/5000 [2:47:29<17:04:57, 14.31s/it]Train:  14%|█▍        | 704/5000 [2:47:43<17:04:13, 14.30s/it]Train:  14%|█▍        | 705/5000 [2:47:57<17:03:39, 14.30s/it]Train:  14%|█▍        | 706/5000 [2:48:12<17:03:39, 14.30s/it]Train:  14%|█▍        | 707/5000 [2:48:26<17:02:10, 14.29s/it]Train:  14%|█▍        | 708/5000 [2:48:40<17:06:53, 14.36s/it]Train:  14%|█▍        | 709/5000 [2:48:55<17:05:53, 14.34s/it]Train:  14%|█▍        | 710/5000 [2:49:09<17:04:59, 14.34s/it]                                                              {'loss': 1.79893837, 'token_acc': 0.60892331, 'grad_norm': 0.25647724, 'learning_rate': 1.954e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069851, 'epoch': 0.14, 'global_step/max_steps': '710/5000', 'percentage': '14.20%', 'elapsed_time': '2h 49m 9s', 'remaining_time': '17h 2m 6s'}
+Train:  14%|█▍        | 710/5000 [2:49:09<17:04:59, 14.34s/it]Train:  14%|█▍        | 710/5000 [2:49:09<17:04:59, 14.34s/it]Train:  14%|█▍        | 711/5000 [2:49:23<17:03:49, 14.32s/it]Train:  14%|█▍        | 712/5000 [2:49:38<17:09:25, 14.40s/it]Train:  14%|█▍        | 713/5000 [2:49:52<17:07:20, 14.38s/it]Train:  14%|█▍        | 714/5000 [2:50:07<17:06:16, 14.37s/it]Train:  14%|█▍        | 715/5000 [2:50:21<17:06:04, 14.37s/it]Train:  14%|█▍        | 716/5000 [2:50:36<17:11:21, 14.44s/it]Train:  14%|█▍        | 717/5000 [2:50:50<17:07:21, 14.39s/it]Train:  14%|█▍        | 718/5000 [2:51:04<17:03:58, 14.35s/it]Train:  14%|█▍        | 719/5000 [2:51:18<17:02:12, 14.33s/it]Train:  14%|█▍        | 720/5000 [2:51:33<17:01:22, 14.32s/it]                                                              {'loss': 1.79390621, 'token_acc': 0.61099339, 'grad_norm': 0.25597, 'learning_rate': 1.952e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069848, 'epoch': 0.14, 'global_step/max_steps': '720/5000', 'percentage': '14.40%', 'elapsed_time': '2h 51m 33s', 'remaining_time': '16h 59m 47s'}
+Train:  14%|█▍        | 720/5000 [2:51:33<17:01:22, 14.32s/it]Train:  14%|█▍        | 720/5000 [2:51:33<17:01:22, 14.32s/it]Train:  14%|█▍        | 721/5000 [2:51:47<17:00:28, 14.31s/it]Train:  14%|█▍        | 722/5000 [2:52:01<16:59:52, 14.30s/it]Train:  14%|█▍        | 723/5000 [2:52:16<16:59:04, 14.30s/it]Train:  14%|█▍        | 724/5000 [2:52:30<16:57:37, 14.28s/it]Train:  14%|█▍        | 725/5000 [2:52:44<16:58:22, 14.29s/it]Train:  15%|█▍        | 726/5000 [2:52:58<16:58:11, 14.29s/it]Train:  15%|█▍        | 727/5000 [2:53:13<16:58:21, 14.30s/it]Train:  15%|█▍        | 728/5000 [2:53:27<16:57:02, 14.28s/it]Train:  15%|█▍        | 729/5000 [2:53:41<16:56:52, 14.29s/it]Train:  15%|█▍        | 730/5000 [2:53:56<16:55:29, 14.27s/it]                                                              {'loss': 1.81070137, 'token_acc': 0.60462273, 'grad_norm': 0.26223978, 'learning_rate': 1.95e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06985, 'epoch': 0.15, 'global_step/max_steps': '730/5000', 'percentage': '14.60%', 'elapsed_time': '2h 53m 56s', 'remaining_time': '16h 57m 23s'}
+Train:  15%|█▍        | 730/5000 [2:53:56<16:55:29, 14.27s/it]Train:  15%|█▍        | 730/5000 [2:53:56<16:55:29, 14.27s/it]Train:  15%|█▍        | 731/5000 [2:54:10<16:55:46, 14.28s/it]Train:  15%|█▍        | 732/5000 [2:54:24<16:56:26, 14.29s/it]Train:  15%|█▍        | 733/5000 [2:54:38<16:55:45, 14.28s/it]Train:  15%|█▍        | 734/5000 [2:54:53<16:55:26, 14.28s/it]Train:  15%|█▍        | 735/5000 [2:55:07<16:55:27, 14.29s/it]Train:  15%|█▍        | 736/5000 [2:55:21<16:54:46, 14.28s/it]Train:  15%|█▍        | 737/5000 [2:55:36<16:55:07, 14.29s/it]Train:  15%|█▍        | 738/5000 [2:55:50<16:54:54, 14.29s/it]Train:  15%|█▍        | 739/5000 [2:56:04<16:54:10, 14.28s/it]Train:  15%|█▍        | 740/5000 [2:56:18<16:53:49, 14.28s/it]                                                              {'loss': 1.79587479, 'token_acc': 0.61292751, 'grad_norm': 0.25875929, 'learning_rate': 1.948e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069852, 'epoch': 0.15, 'global_step/max_steps': '740/5000', 'percentage': '14.80%', 'elapsed_time': '2h 56m 18s', 'remaining_time': '16h 55m 0s'}
+Train:  15%|█▍        | 740/5000 [2:56:18<16:53:49, 14.28s/it]Train:  15%|█▍        | 740/5000 [2:56:18<16:53:49, 14.28s/it]Train:  15%|█▍        | 741/5000 [2:56:33<16:54:00, 14.29s/it]Train:  15%|█▍        | 742/5000 [2:56:47<16:53:20, 14.28s/it]Train:  15%|█▍        | 743/5000 [2:57:01<16:52:19, 14.27s/it]Train:  15%|█▍        | 744/5000 [2:57:15<16:50:56, 14.25s/it]Train:  15%|█▍        | 745/5000 [2:57:30<16:51:31, 14.26s/it]Train:  15%|█▍        | 746/5000 [2:57:44<16:51:28, 14.27s/it]Train:  15%|█▍        | 747/5000 [2:57:58<16:49:46, 14.25s/it]Train:  15%|█▍        | 748/5000 [2:58:12<16:50:43, 14.26s/it]Train:  15%|█▍        | 749/5000 [2:58:27<16:50:19, 14.26s/it]Train:  15%|█▌        | 750/5000 [2:58:41<16:50:30, 14.27s/it]                                                              {'loss': 1.79970131, 'token_acc': 0.61360173, 'grad_norm': 0.26326153, 'learning_rate': 1.946e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069856, 'epoch': 0.15, 'global_step/max_steps': '750/5000', 'percentage': '15.00%', 'elapsed_time': '2h 58m 41s', 'remaining_time': '16h 52m 35s'}
+Train:  15%|█▌        | 750/5000 [2:58:41<16:50:30, 14.27s/it]Train:  15%|█▌        | 750/5000 [2:58:41<16:50:30, 14.27s/it]Train:  15%|█▌        | 751/5000 [2:58:55<16:50:22, 14.27s/it]Train:  15%|█▌        | 752/5000 [2:59:10<16:50:21, 14.27s/it]Train:  15%|█▌        | 753/5000 [2:59:24<16:50:22, 14.27s/it]Train:  15%|█▌        | 754/5000 [2:59:38<16:49:11, 14.26s/it]Train:  15%|█▌        | 755/5000 [2:59:52<16:50:00, 14.28s/it]Train:  15%|█▌        | 756/5000 [3:00:07<16:48:51, 14.26s/it]Train:  15%|█▌        | 757/5000 [3:00:21<16:47:36, 14.25s/it]Train:  15%|█▌        | 758/5000 [3:00:35<16:48:38, 14.27s/it]Train:  15%|█▌        | 759/5000 [3:00:49<16:48:19, 14.27s/it]Train:  15%|█▌        | 760/5000 [3:01:04<16:48:05, 14.27s/it]                                                              {'loss': 1.8005024, 'token_acc': 0.60930907, 'grad_norm': 0.26551321, 'learning_rate': 1.944e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069859, 'epoch': 0.15, 'global_step/max_steps': '760/5000', 'percentage': '15.20%', 'elapsed_time': '3h 1m 4s', 'remaining_time': '16h 50m 10s'}
+Train:  15%|█▌        | 760/5000 [3:01:04<16:48:05, 14.27s/it]Train:  15%|█▌        | 760/5000 [3:01:04<16:48:05, 14.27s/it]Train:  15%|█▌        | 761/5000 [3:01:18<16:47:20, 14.26s/it]Train:  15%|█▌        | 762/5000 [3:01:32<16:47:25, 14.26s/it]Train:  15%|█▌        | 763/5000 [3:01:46<16:46:45, 14.26s/it]Train:  15%|█▌        | 764/5000 [3:02:01<16:46:43, 14.26s/it]Train:  15%|█▌        | 765/5000 [3:02:15<16:46:41, 14.26s/it]Train:  15%|█▌        | 766/5000 [3:02:29<16:46:28, 14.26s/it]Train:  15%|█▌        | 767/5000 [3:02:43<16:45:11, 14.25s/it]Train:  15%|█▌        | 768/5000 [3:02:58<16:44:41, 14.24s/it]Train:  15%|█▌        | 769/5000 [3:03:12<16:44:46, 14.25s/it]Train:  15%|█▌        | 770/5000 [3:03:26<16:44:36, 14.25s/it]                                                              {'loss': 1.80101242, 'token_acc': 0.61385309, 'grad_norm': 0.26125979, 'learning_rate': 1.941e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069863, 'epoch': 0.15, 'global_step/max_steps': '770/5000', 'percentage': '15.40%', 'elapsed_time': '3h 3m 26s', 'remaining_time': '16h 47m 45s'}
+Train:  15%|█▌        | 770/5000 [3:03:26<16:44:36, 14.25s/it]Train:  15%|█▌        | 770/5000 [3:03:26<16:44:36, 14.25s/it]Train:  15%|█▌        | 771/5000 [3:03:40<16:45:35, 14.27s/it]Train:  15%|█▌        | 772/5000 [3:03:55<16:45:35, 14.27s/it]Train:  15%|█▌        | 773/5000 [3:04:09<16:46:05, 14.28s/it]Train:  15%|█▌        | 774/5000 [3:04:23<16:46:25, 14.29s/it]Train:  16%|█▌        | 775/5000 [3:04:38<16:45:58, 14.29s/it]Train:  16%|█▌        | 776/5000 [3:04:52<16:45:33, 14.28s/it]Train:  16%|█▌        | 777/5000 [3:05:06<16:45:38, 14.29s/it]Train:  16%|█▌        | 778/5000 [3:05:21<16:46:01, 14.30s/it]Train:  16%|█▌        | 779/5000 [3:05:35<16:46:04, 14.30s/it]Train:  16%|█▌        | 780/5000 [3:05:49<16:44:29, 14.28s/it]                                                              {'loss': 1.79220543, 'token_acc': 0.61458868, 'grad_norm': 0.25331017, 'learning_rate': 1.939e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069864, 'epoch': 0.16, 'global_step/max_steps': '780/5000', 'percentage': '15.60%', 'elapsed_time': '3h 5m 49s', 'remaining_time': '16h 45m 22s'}
+Train:  16%|█▌        | 780/5000 [3:05:49<16:44:29, 14.28s/it]Train:  16%|█▌        | 780/5000 [3:05:49<16:44:29, 14.28s/it]Train:  16%|█▌        | 781/5000 [3:06:03<16:43:35, 14.27s/it]Train:  16%|█▌        | 782/5000 [3:06:18<16:42:51, 14.27s/it]Train:  16%|█▌        | 783/5000 [3:06:32<16:42:57, 14.27s/it]Train:  16%|█▌        | 784/5000 [3:06:46<16:42:06, 14.26s/it]Train:  16%|█▌        | 785/5000 [3:07:00<16:41:42, 14.26s/it]Train:  16%|█▌        | 786/5000 [3:07:15<16:42:10, 14.27s/it]Train:  16%|█▌        | 787/5000 [3:07:29<16:41:01, 14.26s/it]Train:  16%|█▌        | 788/5000 [3:07:43<16:40:13, 14.25s/it]Train:  16%|█▌        | 789/5000 [3:07:57<16:39:54, 14.25s/it]Train:  16%|█▌        | 790/5000 [3:08:12<16:39:20, 14.24s/it]                                                              {'loss': 1.80002575, 'token_acc': 0.60740542, 'grad_norm': 0.2567344, 'learning_rate': 1.937e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069868, 'epoch': 0.16, 'global_step/max_steps': '790/5000', 'percentage': '15.80%', 'elapsed_time': '3h 8m 12s', 'remaining_time': '16h 42m 56s'}
+Train:  16%|█▌        | 790/5000 [3:08:12<16:39:20, 14.24s/it]Train:  16%|█▌        | 790/5000 [3:08:12<16:39:20, 14.24s/it]Train:  16%|█▌        | 791/5000 [3:08:26<16:40:18, 14.26s/it]Train:  16%|█▌        | 792/5000 [3:08:40<16:40:06, 14.26s/it]Train:  16%|█▌        | 793/5000 [3:08:54<16:40:16, 14.27s/it]Train:  16%|█▌        | 794/5000 [3:09:09<16:39:44, 14.26s/it]Train:  16%|█▌        | 795/5000 [3:09:23<16:38:53, 14.25s/it]Train:  16%|█▌        | 796/5000 [3:09:37<16:39:24, 14.26s/it]Train:  16%|█▌        | 797/5000 [3:09:51<16:39:58, 14.28s/it]Train:  16%|█▌        | 798/5000 [3:10:06<16:39:23, 14.27s/it]Train:  16%|█▌        | 799/5000 [3:10:20<16:39:08, 14.27s/it]Train:  16%|█▌        | 800/5000 [3:10:34<16:38:25, 14.26s/it]                                                              {'loss': 1.80745888, 'token_acc': 0.60814296, 'grad_norm': 0.26573434, 'learning_rate': 1.935e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069871, 'epoch': 0.16, 'global_step/max_steps': '800/5000', 'percentage': '16.00%', 'elapsed_time': '3h 10m 34s', 'remaining_time': '16h 40m 32s'}
+Train:  16%|█▌        | 800/5000 [3:10:34<16:38:25, 14.26s/it]Train:  16%|█▌        | 800/5000 [3:10:34<16:38:25, 14.26s/it]Train:  16%|█▌        | 801/5000 [3:10:49<16:38:39, 14.27s/it]Train:  16%|█▌        | 802/5000 [3:11:03<16:38:18, 14.27s/it]Train:  16%|█▌        | 803/5000 [3:11:17<16:37:46, 14.26s/it]Train:  16%|█▌        | 804/5000 [3:11:31<16:37:50, 14.27s/it]Train:  16%|█▌        | 805/5000 [3:11:46<16:37:27, 14.27s/it]Train:  16%|█▌        | 806/5000 [3:12:00<16:37:09, 14.27s/it]Train:  16%|█▌        | 807/5000 [3:12:14<16:37:19, 14.27s/it]Train:  16%|█▌        | 808/5000 [3:12:29<16:38:39, 14.29s/it]Train:  16%|█▌        | 809/5000 [3:12:43<16:38:02, 14.29s/it]Train:  16%|█▌        | 810/5000 [3:12:57<16:37:48, 14.29s/it]                                                              {'loss': 1.78615303, 'token_acc': 0.60953638, 'grad_norm': 0.26412085, 'learning_rate': 1.932e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069873, 'epoch': 0.16, 'global_step/max_steps': '810/5000', 'percentage': '16.20%', 'elapsed_time': '3h 12m 57s', 'remaining_time': '16h 38m 8s'}
+Train:  16%|█▌        | 810/5000 [3:12:57<16:37:48, 14.29s/it]Train:  16%|█▌        | 810/5000 [3:12:57<16:37:48, 14.29s/it]Train:  16%|█▌        | 811/5000 [3:13:11<16:37:11, 14.28s/it]Train:  16%|█▌        | 812/5000 [3:13:26<16:36:57, 14.28s/it]Train:  16%|█▋        | 813/5000 [3:13:40<16:37:10, 14.29s/it]Train:  16%|█▋        | 814/5000 [3:13:54<16:36:58, 14.29s/it]Train:  16%|█▋        | 815/5000 [3:14:08<16:36:19, 14.28s/it]Train:  16%|█▋        | 816/5000 [3:14:23<16:35:42, 14.28s/it]Train:  16%|█▋        | 817/5000 [3:14:37<16:34:56, 14.27s/it]Train:  16%|█▋        | 818/5000 [3:14:51<16:35:02, 14.28s/it]Train:  16%|█▋        | 819/5000 [3:15:06<16:35:37, 14.29s/it]Train:  16%|█▋        | 820/5000 [3:15:20<16:34:49, 14.28s/it]                                                              {'loss': 1.78626518, 'token_acc': 0.6172106, 'grad_norm': 0.26619133, 'learning_rate': 1.93e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069875, 'epoch': 0.16, 'global_step/max_steps': '820/5000', 'percentage': '16.40%', 'elapsed_time': '3h 15m 20s', 'remaining_time': '16h 35m 45s'}
+Train:  16%|█▋        | 820/5000 [3:15:20<16:34:49, 14.28s/it]Train:  16%|█▋        | 820/5000 [3:15:20<16:34:49, 14.28s/it]Train:  16%|█▋        | 821/5000 [3:15:34<16:34:51, 14.28s/it]Train:  16%|█▋        | 822/5000 [3:15:48<16:34:36, 14.28s/it]Train:  16%|█▋        | 823/5000 [3:16:03<16:34:47, 14.29s/it]Train:  16%|█▋        | 824/5000 [3:16:17<16:33:47, 14.28s/it]Train:  16%|█▋        | 825/5000 [3:16:31<16:33:39, 14.28s/it]Train:  17%|█▋        | 826/5000 [3:16:46<16:32:57, 14.27s/it]Train:  17%|█▋        | 827/5000 [3:17:00<16:32:49, 14.27s/it]Train:  17%|█▋        | 828/5000 [3:17:14<16:32:24, 14.27s/it]Train:  17%|█▋        | 829/5000 [3:17:28<16:32:56, 14.28s/it]Train:  17%|█▋        | 830/5000 [3:17:43<16:31:48, 14.27s/it]                                                              {'loss': 1.80041103, 'token_acc': 0.61366175, 'grad_norm': 0.27509767, 'learning_rate': 1.927e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069877, 'epoch': 0.17, 'global_step/max_steps': '830/5000', 'percentage': '16.60%', 'elapsed_time': '3h 17m 43s', 'remaining_time': '16h 33m 21s'}
+Train:  17%|█▋        | 830/5000 [3:17:43<16:31:48, 14.27s/it]Train:  17%|█▋        | 830/5000 [3:17:43<16:31:48, 14.27s/it]Train:  17%|█▋        | 831/5000 [3:17:57<16:32:02, 14.28s/it]Train:  17%|█▋        | 832/5000 [3:18:11<16:35:36, 14.33s/it]Train:  17%|█▋        | 833/5000 [3:18:26<16:34:05, 14.31s/it]Train:  17%|█▋        | 834/5000 [3:18:40<16:33:11, 14.30s/it]Train:  17%|█▋        | 835/5000 [3:18:54<16:32:28, 14.30s/it]Train:  17%|█▋        | 836/5000 [3:19:09<16:32:37, 14.30s/it]Train:  17%|█▋        | 837/5000 [3:19:23<16:31:17, 14.29s/it]Train:  17%|█▋        | 838/5000 [3:19:37<16:31:08, 14.29s/it]Train:  17%|█▋        | 839/5000 [3:19:51<16:29:51, 14.27s/it]Train:  17%|█▋        | 840/5000 [3:20:06<16:29:13, 14.27s/it]                                                              {'loss': 1.77238083, 'token_acc': 0.61401457, 'grad_norm': 0.26906928, 'learning_rate': 1.925e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069878, 'epoch': 0.17, 'global_step/max_steps': '840/5000', 'percentage': '16.80%', 'elapsed_time': '3h 20m 6s', 'remaining_time': '16h 30m 58s'}
+Train:  17%|█▋        | 840/5000 [3:20:06<16:29:13, 14.27s/it]Train:  17%|█▋        | 840/5000 [3:20:06<16:29:13, 14.27s/it]Train:  17%|█▋        | 841/5000 [3:20:20<16:28:58, 14.27s/it]Train:  17%|█▋        | 842/5000 [3:20:34<16:28:34, 14.27s/it]Train:  17%|█▋        | 843/5000 [3:20:48<16:27:07, 14.25s/it]Train:  17%|█▋        | 844/5000 [3:21:03<16:27:27, 14.26s/it]Train:  17%|█▋        | 845/5000 [3:21:17<16:28:17, 14.27s/it]Train:  17%|█▋        | 846/5000 [3:21:31<16:28:17, 14.27s/it]Train:  17%|█▋        | 847/5000 [3:21:46<16:28:52, 14.29s/it]Train:  17%|█▋        | 848/5000 [3:22:00<16:27:58, 14.28s/it]Train:  17%|█▋        | 849/5000 [3:22:14<16:27:43, 14.28s/it]Train:  17%|█▋        | 850/5000 [3:22:28<16:27:09, 14.27s/it]                                                              {'loss': 1.79507504, 'token_acc': 0.60888982, 'grad_norm': 0.26202893, 'learning_rate': 1.922e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06988, 'epoch': 0.17, 'global_step/max_steps': '850/5000', 'percentage': '17.00%', 'elapsed_time': '3h 22m 28s', 'remaining_time': '16h 28m 34s'}
+Train:  17%|█▋        | 850/5000 [3:22:28<16:27:09, 14.27s/it]Train:  17%|█▋        | 850/5000 [3:22:28<16:27:09, 14.27s/it]Train:  17%|█▋        | 851/5000 [3:22:43<16:26:32, 14.27s/it]Train:  17%|█▋        | 852/5000 [3:22:57<16:26:17, 14.27s/it]Train:  17%|█▋        | 853/5000 [3:23:11<16:26:07, 14.27s/it]Train:  17%|█▋        | 854/5000 [3:23:25<16:26:24, 14.28s/it]Train:  17%|█▋        | 855/5000 [3:23:40<16:25:14, 14.26s/it]Train:  17%|█▋        | 856/5000 [3:23:54<16:24:03, 14.25s/it]Train:  17%|█▋        | 857/5000 [3:24:08<16:23:45, 14.25s/it]Train:  17%|█▋        | 858/5000 [3:24:22<16:24:04, 14.26s/it]Train:  17%|█▋        | 859/5000 [3:24:37<16:23:27, 14.25s/it]Train:  17%|█▋        | 860/5000 [3:24:51<16:23:50, 14.26s/it]                                                              {'loss': 1.77890491, 'token_acc': 0.61119291, 'grad_norm': 0.24806936, 'learning_rate': 1.92e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069883, 'epoch': 0.17, 'global_step/max_steps': '860/5000', 'percentage': '17.20%', 'elapsed_time': '3h 24m 51s', 'remaining_time': '16h 26m 10s'}
+Train:  17%|█▋        | 860/5000 [3:24:51<16:23:50, 14.26s/it]Train:  17%|█▋        | 860/5000 [3:24:51<16:23:50, 14.26s/it]Train:  17%|█▋        | 861/5000 [3:25:05<16:23:12, 14.25s/it]Train:  17%|█▋        | 862/5000 [3:25:19<16:22:38, 14.25s/it]Train:  17%|█▋        | 863/5000 [3:25:34<16:22:18, 14.25s/it]Train:  17%|█▋        | 864/5000 [3:25:48<16:22:09, 14.25s/it]Train:  17%|█▋        | 865/5000 [3:26:02<16:22:11, 14.25s/it]Train:  17%|█▋        | 866/5000 [3:26:16<16:22:02, 14.25s/it]Train:  17%|█▋        | 867/5000 [3:26:31<16:22:36, 14.26s/it]Train:  17%|█▋        | 868/5000 [3:26:45<16:22:56, 14.27s/it]Train:  17%|█▋        | 869/5000 [3:26:59<16:22:21, 14.27s/it]Train:  17%|█▋        | 870/5000 [3:27:13<16:22:33, 14.27s/it]                                                              {'loss': 1.80746498, 'token_acc': 0.61220558, 'grad_norm': 0.25171104, 'learning_rate': 1.917e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069886, 'epoch': 0.17, 'global_step/max_steps': '870/5000', 'percentage': '17.40%', 'elapsed_time': '3h 27m 13s', 'remaining_time': '16h 23m 45s'}
+Train:  17%|█▋        | 870/5000 [3:27:13<16:22:33, 14.27s/it]Train:  17%|█▋        | 870/5000 [3:27:13<16:22:33, 14.27s/it]Train:  17%|█▋        | 871/5000 [3:27:28<16:21:36, 14.26s/it]Train:  17%|█▋        | 872/5000 [3:27:42<16:20:50, 14.26s/it]Train:  17%|█▋        | 873/5000 [3:27:56<16:20:43, 14.26s/it]Train:  17%|█▋        | 874/5000 [3:28:10<16:20:45, 14.26s/it]Train:  18%|█▊        | 875/5000 [3:28:25<16:19:52, 14.25s/it]Train:  18%|█▊        | 876/5000 [3:28:39<16:19:55, 14.26s/it]Train:  18%|█▊        | 877/5000 [3:28:53<16:19:02, 14.25s/it]Train:  18%|█▊        | 878/5000 [3:29:08<16:19:46, 14.26s/it]Train:  18%|█▊        | 879/5000 [3:29:22<16:19:33, 14.26s/it]Train:  18%|█▊        | 880/5000 [3:29:36<16:19:53, 14.27s/it]                                                              {'loss': 1.78831673, 'token_acc': 0.61089294, 'grad_norm': 0.27332497, 'learning_rate': 1.914e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069888, 'epoch': 0.18, 'global_step/max_steps': '880/5000', 'percentage': '17.60%', 'elapsed_time': '3h 29m 36s', 'remaining_time': '16h 21m 21s'}
+Train:  18%|█▊        | 880/5000 [3:29:36<16:19:53, 14.27s/it]Train:  18%|█▊        | 880/5000 [3:29:36<16:19:53, 14.27s/it]Train:  18%|█▊        | 881/5000 [3:29:50<16:19:52, 14.27s/it]Train:  18%|█▊        | 882/5000 [3:30:05<16:19:11, 14.27s/it]Train:  18%|█▊        | 883/5000 [3:30:19<16:18:54, 14.27s/it]Train:  18%|█▊        | 884/5000 [3:30:33<16:18:16, 14.26s/it]Train:  18%|█▊        | 885/5000 [3:30:47<16:18:20, 14.27s/it]Train:  18%|█▊        | 886/5000 [3:31:02<16:18:00, 14.26s/it]Train:  18%|█▊        | 887/5000 [3:31:16<16:17:27, 14.26s/it]Train:  18%|█▊        | 888/5000 [3:31:30<16:17:14, 14.26s/it]Train:  18%|█▊        | 889/5000 [3:31:44<16:18:03, 14.27s/it]Train:  18%|█▊        | 890/5000 [3:31:59<16:17:20, 14.27s/it]                                                              {'loss': 1.78347893, 'token_acc': 0.6158653, 'grad_norm': 0.26379606, 'learning_rate': 1.912e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069891, 'epoch': 0.18, 'global_step/max_steps': '890/5000', 'percentage': '17.80%', 'elapsed_time': '3h 31m 59s', 'remaining_time': '16h 18m 57s'}
+Train:  18%|█▊        | 890/5000 [3:31:59<16:17:20, 14.27s/it]Train:  18%|█▊        | 890/5000 [3:31:59<16:17:20, 14.27s/it]Train:  18%|█▊        | 891/5000 [3:32:13<16:17:07, 14.27s/it]Train:  18%|█▊        | 892/5000 [3:32:27<16:18:32, 14.29s/it]Train:  18%|█▊        | 893/5000 [3:32:42<16:17:34, 14.28s/it]Train:  18%|█▊        | 894/5000 [3:32:56<16:17:55, 14.29s/it]Train:  18%|█▊        | 895/5000 [3:33:10<16:17:49, 14.29s/it]Train:  18%|█▊        | 896/5000 [3:33:24<16:17:10, 14.29s/it]Train:  18%|█▊        | 897/5000 [3:33:39<16:16:05, 14.27s/it]Train:  18%|█▊        | 898/5000 [3:33:53<16:15:34, 14.27s/it]Train:  18%|█▊        | 899/5000 [3:34:07<16:15:41, 14.28s/it]Train:  18%|█▊        | 900/5000 [3:34:22<16:15:15, 14.27s/it]                                                              {'loss': 1.78171062, 'token_acc': 0.60849624, 'grad_norm': 0.25964943, 'learning_rate': 1.909e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069892, 'epoch': 0.18, 'global_step/max_steps': '900/5000', 'percentage': '18.00%', 'elapsed_time': '3h 34m 22s', 'remaining_time': '16h 16m 33s'}
+Train:  18%|█▊        | 900/5000 [3:34:22<16:15:15, 14.27s/it]Train:  18%|█▊        | 900/5000 [3:34:22<16:15:15, 14.27s/it]Train:  18%|█▊        | 901/5000 [3:34:36<16:14:53, 14.27s/it]Train:  18%|█▊        | 902/5000 [3:34:50<16:14:43, 14.27s/it]Train:  18%|█▊        | 903/5000 [3:35:04<16:14:07, 14.27s/it]Train:  18%|█▊        | 904/5000 [3:35:19<16:13:14, 14.26s/it]Train:  18%|█▊        | 905/5000 [3:35:33<16:13:24, 14.26s/it]Train:  18%|█▊        | 906/5000 [3:35:47<16:13:47, 14.27s/it]Train:  18%|█▊        | 907/5000 [3:36:01<16:13:20, 14.27s/it]Train:  18%|█▊        | 908/5000 [3:36:16<16:13:29, 14.27s/it]Train:  18%|█▊        | 909/5000 [3:36:30<16:12:45, 14.27s/it]Train:  18%|█▊        | 910/5000 [3:36:44<16:12:31, 14.27s/it]                                                              {'loss': 1.79683533, 'token_acc': 0.60831336, 'grad_norm': 0.25274524, 'learning_rate': 1.906e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069895, 'epoch': 0.18, 'global_step/max_steps': '910/5000', 'percentage': '18.20%', 'elapsed_time': '3h 36m 44s', 'remaining_time': '16h 14m 9s'}
+Train:  18%|█▊        | 910/5000 [3:36:44<16:12:31, 14.27s/it]Train:  18%|█▊        | 910/5000 [3:36:44<16:12:31, 14.27s/it]Train:  18%|█▊        | 911/5000 [3:36:58<16:13:04, 14.28s/it]Train:  18%|█▊        | 912/5000 [3:37:13<16:12:16, 14.27s/it]Train:  18%|█▊        | 913/5000 [3:37:27<16:12:07, 14.27s/it]Train:  18%|█▊        | 914/5000 [3:37:41<16:11:59, 14.27s/it]Train:  18%|█▊        | 915/5000 [3:37:56<16:11:15, 14.27s/it]Train:  18%|█▊        | 916/5000 [3:38:10<16:11:40, 14.28s/it]Train:  18%|█▊        | 917/5000 [3:38:24<16:12:45, 14.29s/it]Train:  18%|█▊        | 918/5000 [3:38:38<16:11:49, 14.28s/it]Train:  18%|█▊        | 919/5000 [3:38:53<16:11:26, 14.28s/it]Train:  18%|█▊        | 920/5000 [3:39:07<16:11:55, 14.29s/it]                                                              {'loss': 1.78489361, 'token_acc': 0.61761506, 'grad_norm': 0.25765172, 'learning_rate': 1.903e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069896, 'epoch': 0.18, 'global_step/max_steps': '920/5000', 'percentage': '18.40%', 'elapsed_time': '3h 39m 7s', 'remaining_time': '16h 11m 46s'}
+Train:  18%|█▊        | 920/5000 [3:39:07<16:11:55, 14.29s/it]Train:  18%|█▊        | 920/5000 [3:39:07<16:11:55, 14.29s/it]Train:  18%|█▊        | 921/5000 [3:39:21<16:11:09, 14.29s/it]Train:  18%|█▊        | 922/5000 [3:39:36<16:11:11, 14.29s/it]Train:  18%|█▊        | 923/5000 [3:39:50<16:10:33, 14.28s/it]Train:  18%|█▊        | 924/5000 [3:40:04<16:10:36, 14.29s/it]Train:  18%|█▊        | 925/5000 [3:40:18<16:10:37, 14.29s/it]Train:  19%|█▊        | 926/5000 [3:40:33<16:10:07, 14.29s/it]Train:  19%|█▊        | 927/5000 [3:40:47<16:09:25, 14.28s/it]Train:  19%|█▊        | 928/5000 [3:41:01<16:08:08, 14.27s/it]Train:  19%|█▊        | 929/5000 [3:41:16<16:08:34, 14.28s/it]Train:  19%|█▊        | 930/5000 [3:41:30<16:08:22, 14.28s/it]                                                              {'loss': 1.7836689, 'token_acc': 0.61785175, 'grad_norm': 0.2534987, 'learning_rate': 1.901e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069897, 'epoch': 0.19, 'global_step/max_steps': '930/5000', 'percentage': '18.60%', 'elapsed_time': '3h 41m 30s', 'remaining_time': '16h 9m 23s'}
+Train:  19%|█▊        | 930/5000 [3:41:30<16:08:22, 14.28s/it]Train:  19%|█▊        | 930/5000 [3:41:30<16:08:22, 14.28s/it]Train:  19%|█▊        | 931/5000 [3:41:44<16:08:28, 14.28s/it]Train:  19%|█▊        | 932/5000 [3:41:58<16:08:29, 14.28s/it]Train:  19%|█▊        | 933/5000 [3:42:13<16:08:19, 14.29s/it]Train:  19%|█▊        | 934/5000 [3:42:27<16:07:48, 14.28s/it]Train:  19%|█▊        | 935/5000 [3:42:41<16:08:22, 14.29s/it]Train:  19%|█▊        | 936/5000 [3:42:56<16:07:26, 14.28s/it]Train:  19%|█▊        | 937/5000 [3:43:10<16:07:08, 14.28s/it]Train:  19%|█▉        | 938/5000 [3:43:24<16:08:00, 14.30s/it]Train:  19%|█▉        | 939/5000 [3:43:38<16:07:07, 14.29s/it]Train:  19%|█▉        | 940/5000 [3:43:53<16:06:31, 14.28s/it]                                                              {'loss': 1.77157936, 'token_acc': 0.61186716, 'grad_norm': 0.2569302, 'learning_rate': 1.898e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069898, 'epoch': 0.19, 'global_step/max_steps': '940/5000', 'percentage': '18.80%', 'elapsed_time': '3h 43m 53s', 'remaining_time': '16h 6m 59s'}
+Train:  19%|█▉        | 940/5000 [3:43:53<16:06:31, 14.28s/it]Train:  19%|█▉        | 940/5000 [3:43:53<16:06:31, 14.28s/it]Train:  19%|█▉        | 941/5000 [3:44:07<16:05:54, 14.28s/it]Train:  19%|█▉        | 942/5000 [3:44:21<16:05:34, 14.28s/it]Train:  19%|█▉        | 943/5000 [3:44:36<16:05:31, 14.28s/it]Train:  19%|█▉        | 944/5000 [3:44:50<16:06:45, 14.30s/it]Train:  19%|█▉        | 945/5000 [3:45:04<16:05:27, 14.29s/it]Train:  19%|█▉        | 946/5000 [3:45:18<16:04:21, 14.27s/it]Train:  19%|█▉        | 947/5000 [3:45:33<16:03:40, 14.27s/it]Train:  19%|█���        | 948/5000 [3:45:47<16:04:04, 14.28s/it]Train:  19%|█▉        | 949/5000 [3:46:01<16:03:28, 14.27s/it]Train:  19%|█▉        | 950/5000 [3:46:16<16:05:43, 14.31s/it]                                                              {'loss': 1.77213745, 'token_acc': 0.61111984, 'grad_norm': 0.25514475, 'learning_rate': 1.895e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069899, 'epoch': 0.19, 'global_step/max_steps': '950/5000', 'percentage': '19.00%', 'elapsed_time': '3h 46m 16s', 'remaining_time': '16h 4m 36s'}
+Train:  19%|█▉        | 950/5000 [3:46:16<16:05:43, 14.31s/it]Train:  19%|█▉        | 950/5000 [3:46:16<16:05:43, 14.31s/it]Train:  19%|█▉        | 951/5000 [3:46:30<16:04:16, 14.29s/it]Train:  19%|█▉        | 952/5000 [3:46:44<16:02:52, 14.27s/it]Train:  19%|█▉        | 953/5000 [3:46:58<16:01:09, 14.25s/it]Train:  19%|█▉        | 954/5000 [3:47:12<16:00:27, 14.24s/it]Train:  19%|█▉        | 955/5000 [3:47:27<16:00:56, 14.25s/it]Train:  19%|█▉        | 956/5000 [3:47:41<16:00:37, 14.25s/it]Train:  19%|█▉        | 957/5000 [3:47:55<15:59:40, 14.24s/it]Train:  19%|█▉        | 958/5000 [3:48:09<15:59:43, 14.25s/it]Train:  19%|█▉        | 959/5000 [3:48:24<16:00:38, 14.26s/it]Train:  19%|█▉        | 960/5000 [3:48:38<16:00:40, 14.27s/it]                                                              {'loss': 1.78173332, 'token_acc': 0.61612224, 'grad_norm': 0.28978261, 'learning_rate': 1.892e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069902, 'epoch': 0.19, 'global_step/max_steps': '960/5000', 'percentage': '19.20%', 'elapsed_time': '3h 48m 38s', 'remaining_time': '16h 2m 12s'}
+Train:  19%|█▉        | 960/5000 [3:48:38<16:00:40, 14.27s/it]Train:  19%|█▉        | 960/5000 [3:48:38<16:00:40, 14.27s/it]Train:  19%|█▉        | 961/5000 [3:48:52<16:00:45, 14.27s/it]Train:  19%|█▉        | 962/5000 [3:49:07<15:59:46, 14.26s/it]Train:  19%|█▉        | 963/5000 [3:49:21<15:59:50, 14.27s/it]Train:  19%|█▉        | 964/5000 [3:49:35<15:59:41, 14.27s/it]Train:  19%|█▉        | 965/5000 [3:49:49<15:58:57, 14.26s/it]Train:  19%|█▉        | 966/5000 [3:50:04<15:59:23, 14.27s/it]Train:  19%|█▉        | 967/5000 [3:50:18<15:58:42, 14.26s/it]Train:  19%|█▉        | 968/5000 [3:50:32<15:58:29, 14.26s/it]Train:  19%|█▉        | 969/5000 [3:50:46<15:58:03, 14.26s/it]Train:  19%|█▉        | 970/5000 [3:51:01<15:58:19, 14.27s/it]                                                              {'loss': 1.78222809, 'token_acc': 0.611257, 'grad_norm': 0.252094, 'learning_rate': 1.889e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069904, 'epoch': 0.19, 'global_step/max_steps': '970/5000', 'percentage': '19.40%', 'elapsed_time': '3h 51m 1s', 'remaining_time': '15h 59m 48s'}
+Train:  19%|█▉        | 970/5000 [3:51:01<15:58:19, 14.27s/it]Train:  19%|█▉        | 970/5000 [3:51:01<15:58:19, 14.27s/it]Train:  19%|█▉        | 971/5000 [3:51:15<15:58:25, 14.27s/it]Train:  19%|█▉        | 972/5000 [3:51:29<15:57:36, 14.26s/it]Train:  19%|█▉        | 973/5000 [3:51:44<15:57:37, 14.27s/it]Train:  19%|█▉        | 974/5000 [3:51:58<15:57:25, 14.27s/it]Train:  20%|█▉        | 975/5000 [3:52:12<15:57:27, 14.27s/it]Train:  20%|█▉        | 976/5000 [3:52:26<15:57:44, 14.28s/it]Train:  20%|█▉        | 977/5000 [3:52:41<15:57:08, 14.28s/it]Train:  20%|█▉        | 978/5000 [3:52:55<15:56:00, 14.26s/it]Train:  20%|█▉        | 979/5000 [3:53:09<15:55:37, 14.26s/it]Train:  20%|█▉        | 980/5000 [3:53:23<15:55:49, 14.27s/it]                                                              {'loss': 1.79098339, 'token_acc': 0.61782099, 'grad_norm': 0.2651937, 'learning_rate': 1.886e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069906, 'epoch': 0.2, 'global_step/max_steps': '980/5000', 'percentage': '19.60%', 'elapsed_time': '3h 53m 23s', 'remaining_time': '15h 57m 24s'}
+Train:  20%|█▉        | 980/5000 [3:53:23<15:55:49, 14.27s/it]Train:  20%|█▉        | 980/5000 [3:53:23<15:55:49, 14.27s/it]Train:  20%|█▉        | 981/5000 [3:53:38<15:55:53, 14.27s/it]Train:  20%|█▉        | 982/5000 [3:53:52<15:55:36, 14.27s/it]Train:  20%|█▉        | 983/5000 [3:54:06<15:55:25, 14.27s/it]Train:  20%|█▉        | 984/5000 [3:54:20<15:54:21, 14.26s/it]Train:  20%|█▉        | 985/5000 [3:54:35<15:54:23, 14.26s/it]Train:  20%|█▉        | 986/5000 [3:54:49<15:54:44, 14.27s/it]Train:  20%|█▉        | 987/5000 [3:55:03<15:53:39, 14.26s/it]Train:  20%|█▉        | 988/5000 [3:55:17<15:53:32, 14.26s/it]Train:  20%|█▉        | 989/5000 [3:55:32<15:53:34, 14.26s/it]Train:  20%|█▉        | 990/5000 [3:55:46<15:54:03, 14.28s/it]                                                              {'loss': 1.78043766, 'token_acc': 0.61179713, 'grad_norm': 0.26628599, 'learning_rate': 1.883e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.069908, 'epoch': 0.2, 'global_step/max_steps': '990/5000', 'percentage': '19.80%', 'elapsed_time': '3h 55m 46s', 'remaining_time': '15h 55m 0s'}
+Train:  20%|█▉        | 990/5000 [3:55:46<15:54:03, 14.28s/it]Train:  20%|█▉        | 990/5000 [3:55:46<15:54:03, 14.28s/it]Train:  20%|█▉        | 991/5000 [3:56:00<15:52:58, 14.26s/it]Train:  20%|█▉        | 992/5000 [3:56:15<15:51:46, 14.25s/it]Train:  20%|█▉        | 993/5000 [3:56:29<15:52:44, 14.27s/it]Train:  20%|█▉        | 994/5000 [3:56:43<15:52:43, 14.27s/it]Train:  20%|█▉        | 995/5000 [3:56:57<15:53:27, 14.28s/it]Train:  20%|█▉        | 996/5000 [3:57:12<15:51:46, 14.26s/it]Train:  20%|█▉        | 997/5000 [3:57:26<15:51:42, 14.26s/it]Train:  20%|█▉        | 998/5000 [3:57:40<15:52:10, 14.28s/it]Train:  20%|█▉        | 999/5000 [3:57:54<15:50:44, 14.26s/it]Train:  20%|██        | 1000/5000 [3:58:09<15:49:47, 14.25s/it]                                                               {'loss': 1.77623749, 'token_acc': 0.61425036, 'grad_norm': 0.26637572, 'learning_rate': 1.879e-05, 'memory(GiB)': 126.46, 'train_speed(iter/s)': 0.06991, 'epoch': 0.2, 'global_step/max_steps': '1000/5000', 'percentage': '20.00%', 'elapsed_time': '3h 58m 9s', 'remaining_time': '15h 52m 36s'}
+Train:  20%|██        | 1000/5000 [3:58:09<15:49:47, 14.25s/it]Train:  20%|██        | 1000/5000 [3:58:09<15:49:47, 14.25s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+Token indices sequence length is longer than the specified maximum sequence length for this model (173093 > 131072). Running this sequence through the model will result in indexing errors
+                                                               {'eval_loss': 1.48531151, 'eval_token_acc': 0.65717447, 'eval_runtime': 42.1731, 'eval_samples_per_second': 0.332, 'eval_steps_per_second': 0.024, 'epoch': 0.2, 'global_step/max_steps': '1000/5000', 'percentage': '20.00%', 'elapsed_time': '3h 58m 51s', 'remaining_time': '15h 55m 25s'}
+Train:  20%|██        | 1000/5000 [3:58:51<15:49:47, 14.25s/it]Train:  20%|██        | 1000/5000 [3:58:51<15:49:47, 14.25s/it][INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/checkpoint-1000
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  20%|██        | 1001/5000 [3:59:52<45:33:25, 41.01s/it]Train:  20%|██        | 1002/5000 [4:00:06<36:37:50, 32.98s/it]Train:  20%|██        | 1003/5000 [4:00:21<30:24:28, 27.39s/it]Train:  20%|██        | 1004/5000 [4:00:35<26:02:58, 23.47s/it]Train:  20%|██        | 1005/5000 [4:00:49<23:01:23, 20.75s/it]Train:  20%|██        | 1006/5000 [4:01:04<20:54:02, 18.84s/it]Train:  20%|██        | 1007/5000 [4:01:18<19:24:51, 17.50s/it]Train:  20%|██        | 1008/5000 [4:01:33<18:22:00, 16.56s/it]Train:  20%|██        | 1009/5000 [4:01:47<17:37:59, 15.91s/it]Train:  20%|██        | 1010/5000 [4:02:01<17:06:14, 15.43s/it]                                                               {'loss': 1.75486431, 'token_acc': 0.6226663, 'grad_norm': 0.25502607, 'learning_rate': 1.876e-05, 'memory(GiB)': 129.44, 'train_speed(iter/s)': 0.069479, 'epoch': 0.2, 'global_step/max_steps': '1010/5000', 'percentage': '20.20%', 'elapsed_time': '4h 2m 1s', 'remaining_time': '15h 56m 8s'}
+Train:  20%|██        | 1010/5000 [4:02:01<17:06:14, 15.43s/it]Train:  20%|██        | 1010/5000 [4:02:01<17:06:14, 15.43s/it]Train:  20%|██        | 1011/5000 [4:02:16<16:44:24, 15.11s/it]Train:  20%|██        | 1012/5000 [4:02:30<16:28:25, 14.87s/it]Train:  20%|██        | 1013/5000 [4:02:44<16:17:45, 14.71s/it]Train:  20%|██        | 1014/5000 [4:02:59<16:10:12, 14.60s/it]Train:  20%|██        | 1015/5000 [4:03:13<16:03:56, 14.51s/it]Train:  20%|██        | 1016/5000 [4:03:27<15:59:39, 14.45s/it]Train:  20%|██        | 1017/5000 [4:03:42<15:57:09, 14.42s/it]Train:  20%|██        | 1018/5000 [4:03:56<15:55:25, 14.40s/it]Train:  20%|██        | 1019/5000 [4:04:10<15:53:10, 14.37s/it]Train:  20%|██        | 1020/5000 [4:04:24<15:50:36, 14.33s/it]                                                               {'loss': 1.77758064, 'token_acc': 0.61914479, 'grad_norm': 0.24773309, 'learning_rate': 1.873e-05, 'memory(GiB)': 129.48, 'train_speed(iter/s)': 0.069483, 'epoch': 0.2, 'global_step/max_steps': '1020/5000', 'percentage': '20.40%', 'elapsed_time': '4h 4m 24s', 'remaining_time': '15h 53m 42s'}
+Train:  20%|██        | 1020/5000 [4:04:24<15:50:36, 14.33s/it]Train:  20%|██        | 1020/5000 [4:04:24<15:50:36, 14.33s/it]Train:  20%|██        | 1021/5000 [4:04:39<15:49:53, 14.32s/it]Train:  20%|██        | 1022/5000 [4:04:53<15:49:03, 14.31s/it]Train:  20%|██        | 1023/5000 [4:05:07<15:48:22, 14.31s/it]Train:  20%|██        | 1024/5000 [4:05:22<15:46:57, 14.29s/it]Train:  20%|██        | 1025/5000 [4:05:36<15:46:48, 14.29s/it]Train:  21%|██        | 1026/5000 [4:05:50<15:46:37, 14.29s/it]Train:  21%|██        | 1027/5000 [4:06:04<15:45:53, 14.28s/it]Train:  21%|██        | 1028/5000 [4:06:19<15:46:51, 14.30s/it]Train:  21%|██        | 1029/5000 [4:06:33<15:46:22, 14.30s/it]Train:  21%|██        | 1030/5000 [4:06:47<15:46:27, 14.30s/it]                                                               {'loss': 1.76901321, 'token_acc': 0.61926109, 'grad_norm': 0.24070834, 'learning_rate': 1.87e-05, 'memory(GiB)': 129.48, 'train_speed(iter/s)': 0.069487, 'epoch': 0.21, 'global_step/max_steps': '1030/5000', 'percentage': '20.60%', 'elapsed_time': '4h 6m 47s', 'remaining_time': '15h 51m 15s'}
+Train:  21%|██        | 1030/5000 [4:06:47<15:46:27, 14.30s/it]Train:  21%|██        | 1030/5000 [4:06:47<15:46:27, 14.30s/it]Train:  21%|██        | 1031/5000 [4:07:02<15:45:37, 14.30s/it]Train:  21%|██        | 1032/5000 [4:07:16<15:45:39, 14.30s/it]Train:  21%|██        | 1033/5000 [4:07:30<15:45:56, 14.31s/it]Train:  21%|██        | 1034/5000 [4:07:45<15:45:41, 14.31s/it]Train:  21%|██        | 1035/5000 [4:07:59<15:44:44, 14.30s/it]Train:  21%|██        | 1036/5000 [4:08:13<15:44:26, 14.30s/it]Train:  21%|██        | 1037/5000 [4:08:27<15:43:32, 14.29s/it]Train:  21%|██        | 1038/5000 [4:08:42<15:43:58, 14.30s/it]Train:  21%|██        | 1039/5000 [4:08:56<15:43:10, 14.29s/it]Train:  21%|██        | 1040/5000 [4:09:10<15:42:04, 14.27s/it]                                                               {'loss': 1.77395554, 'token_acc': 0.61445724, 'grad_norm': 0.24869798, 'learning_rate': 1.867e-05, 'memory(GiB)': 129.48, 'train_speed(iter/s)': 0.069492, 'epoch': 0.21, 'global_step/max_steps': '1040/5000', 'percentage': '20.80%', 'elapsed_time': '4h 9m 10s', 'remaining_time': '15h 48m 47s'}
+Train:  21%|██        | 1040/5000 [4:09:10<15:42:04, 14.27s/it]Train:  21%|██        | 1040/5000 [4:09:10<15:42:04, 14.27s/it]Train:  21%|██        | 1041/5000 [4:09:25<15:42:33, 14.28s/it]Train:  21%|██        | 1042/5000 [4:09:39<15:41:18, 14.27s/it]Train:  21%|██        | 1043/5000 [4:09:53<15:41:14, 14.27s/it]Train:  21%|██        | 1044/5000 [4:10:07<15:41:30, 14.28s/it]Train:  21%|██        | 1045/5000 [4:10:22<15:41:23, 14.28s/it]Train:  21%|██        | 1046/5000 [4:10:36<15:41:29, 14.29s/it]Train:  21%|██        | 1047/5000 [4:10:50<15:40:02, 14.27s/it]Train:  21%|██        | 1048/5000 [4:11:04<15:39:25, 14.26s/it]Train:  21%|██        | 1049/5000 [4:11:19<15:38:34, 14.25s/it]Train:  21%|██        | 1050/5000 [4:11:33<15:38:25, 14.25s/it]                                                               {'loss': 1.78150101, 'token_acc': 0.61456345, 'grad_norm': 0.28278297, 'learning_rate': 1.863e-05, 'memory(GiB)': 129.48, 'train_speed(iter/s)': 0.069498, 'epoch': 0.21, 'global_step/max_steps': '1050/5000', 'percentage': '21.00%', 'elapsed_time': '4h 11m 33s', 'remaining_time': '15h 46m 20s'}
+Train:  21%|██        | 1050/5000 [4:11:33<15:38:25, 14.25s/it]Train:  21%|██        | 1050/5000 [4:11:33<15:38:25, 14.25s/it]Train:  21%|██        | 1051/5000 [4:11:47<15:38:48, 14.26s/it]Train:  21%|██        | 1052/5000 [4:12:02<15:38:57, 14.27s/it]Train:  21%|██        | 1053/5000 [4:12:16<15:38:10, 14.26s/it]Train:  21%|██        | 1054/5000 [4:12:30<15:38:51, 14.28s/it]Train:  21%|██        | 1055/5000 [4:12:44<15:38:18, 14.27s/it]Train:  21%|██        | 1056/5000 [4:12:59<15:37:59, 14.27s/it]Train:  21%|██        | 1057/5000 [4:13:13<15:38:15, 14.28s/it]Train:  21%|██        | 1058/5000 [4:13:27<15:37:44, 14.27s/it]Train:  21%|██        | 1059/5000 [4:13:41<15:37:48, 14.28s/it]Train:  21%|██        | 1060/5000 [4:13:56<15:37:28, 14.28s/it]                                                               {'loss': 1.7722971, 'token_acc': 0.61059023, 'grad_norm': 0.25826696, 'learning_rate': 1.86e-05, 'memory(GiB)': 129.48, 'train_speed(iter/s)': 0.069503, 'epoch': 0.21, 'global_step/max_steps': '1060/5000', 'percentage': '21.20%', 'elapsed_time': '4h 13m 56s', 'remaining_time': '15h 43m 52s'}
+Train:  21%|██        | 1060/5000 [4:13:56<15:37:28, 14.28s/it]Train:  21%|██        | 1060/5000 [4:13:56<15:37:28, 14.28s/it]Train:  21%|██        | 1061/5000 [4:14:10<15:36:29, 14.26s/it]Train:  21%|██        | 1062/5000 [4:14:24<15:35:43, 14.26s/it]Train:  21%|██▏       | 1063/5000 [4:14:38<15:35:36, 14.26s/it]Train:  21%|██▏       | 1064/5000 [4:14:53<15:34:57, 14.25s/it]Train:  21%|██▏       | 1065/5000 [4:15:07<15:36:28, 14.28s/it]Train:  21%|██▏       | 1066/5000 [4:15:21<15:35:20, 14.27s/it]Train:  21%|██▏       | 1067/5000 [4:15:36<15:36:06, 14.28s/it]Train:  21%|██▏       | 1068/5000 [4:15:50<15:36:19, 14.29s/it]Train:  21%|██▏       | 1069/5000 [4:16:04<15:35:25, 14.28s/it]Train:  21%|██▏       | 1070/5000 [4:16:18<15:35:17, 14.28s/it]                                                               {'loss': 1.77516136, 'token_acc': 0.6180243, 'grad_norm': 0.25667128, 'learning_rate': 1.857e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069508, 'epoch': 0.21, 'global_step/max_steps': '1070/5000', 'percentage': '21.40%', 'elapsed_time': '4h 16m 18s', 'remaining_time': '15h 41m 25s'}
+Train:  21%|██▏       | 1070/5000 [4:16:18<15:35:17, 14.28s/it]Train:  21%|██▏       | 1070/5000 [4:16:18<15:35:17, 14.28s/it]Train:  21%|██▏       | 1071/5000 [4:16:33<15:35:17, 14.28s/it]Train:  21%|██▏       | 1072/5000 [4:16:47<15:34:39, 14.28s/it]Train:  21%|██▏       | 1073/5000 [4:17:01<15:34:59, 14.29s/it]Train:  21%|██▏       | 1074/5000 [4:17:16<15:33:39, 14.27s/it]Train:  22%|██▏       | 1075/5000 [4:17:30<15:33:39, 14.27s/it]Train:  22%|██▏       | 1076/5000 [4:17:44<15:33:39, 14.28s/it]Train:  22%|██▏       | 1077/5000 [4:17:58<15:33:59, 14.28s/it]Train:  22%|██▏       | 1078/5000 [4:18:13<15:33:56, 14.29s/it]Train:  22%|██▏       | 1079/5000 [4:18:27<15:33:12, 14.28s/it]Train:  22%|██▏       | 1080/5000 [4:18:41<15:33:25, 14.29s/it]                                                               {'loss': 1.75914536, 'token_acc': 0.6170802, 'grad_norm': 0.25736195, 'learning_rate': 1.853e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069513, 'epoch': 0.22, 'global_step/max_steps': '1080/5000', 'percentage': '21.60%', 'elapsed_time': '4h 18m 41s', 'remaining_time': '15h 38m 58s'}
+Train:  22%|██▏       | 1080/5000 [4:18:41<15:33:25, 14.29s/it]Train:  22%|██▏       | 1080/5000 [4:18:41<15:33:25, 14.29s/it]Train:  22%|██▏       | 1081/5000 [4:18:56<15:32:50, 14.28s/it]Train:  22%|██▏       | 1082/5000 [4:19:10<15:32:24, 14.28s/it]Train:  22%|██▏       | 1083/5000 [4:19:24<15:31:51, 14.27s/it]Train:  22%|██▏       | 1084/5000 [4:19:38<15:31:14, 14.27s/it]Train:  22%|██▏       | 1085/5000 [4:19:53<15:31:14, 14.27s/it]Train:  22%|██▏       | 1086/5000 [4:20:07<15:31:08, 14.27s/it]Train:  22%|██▏       | 1087/5000 [4:20:21<15:30:24, 14.27s/it]Train:  22%|██▏       | 1088/5000 [4:20:35<15:29:46, 14.26s/it]Train:  22%|██▏       | 1089/5000 [4:20:50<15:29:25, 14.26s/it]Train:  22%|██▏       | 1090/5000 [4:21:04<15:28:44, 14.25s/it]                                                               {'loss': 1.77400818, 'token_acc': 0.61384772, 'grad_norm': 0.26768196, 'learning_rate': 1.85e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069519, 'epoch': 0.22, 'global_step/max_steps': '1090/5000', 'percentage': '21.80%', 'elapsed_time': '4h 21m 4s', 'remaining_time': '15h 36m 30s'}
+Train:  22%|██▏       | 1090/5000 [4:21:04<15:28:44, 14.25s/it]Train:  22%|██▏       | 1090/5000 [4:21:04<15:28:44, 14.25s/it]Train:  22%|██▏       | 1091/5000 [4:21:18<15:28:25, 14.25s/it]Train:  22%|██▏       | 1092/5000 [4:21:32<15:29:02, 14.26s/it]Train:  22%|██▏       | 1093/5000 [4:21:47<15:28:41, 14.26s/it]Train:  22%|██▏       | 1094/5000 [4:22:01<15:29:03, 14.27s/it]Train:  22%|██▏       | 1095/5000 [4:22:15<15:27:48, 14.26s/it]Train:  22%|██▏       | 1096/5000 [4:22:29<15:27:42, 14.26s/it]Train:  22%|██▏       | 1097/5000 [4:22:44<15:27:17, 14.26s/it]Train:  22%|██▏       | 1098/5000 [4:22:58<15:26:36, 14.25s/it]Train:  22%|██▏       | 1099/5000 [4:23:12<15:26:59, 14.26s/it]Train:  22%|██▏       | 1100/5000 [4:23:26<15:26:08, 14.25s/it]                                                               {'loss': 1.7675005, 'token_acc': 0.61241105, 'grad_norm': 0.24824873, 'learning_rate': 1.846e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069524, 'epoch': 0.22, 'global_step/max_steps': '1100/5000', 'percentage': '22.00%', 'elapsed_time': '4h 23m 26s', 'remaining_time': '15h 34m 2s'}
+Train:  22%|██▏       | 1100/5000 [4:23:26<15:26:08, 14.25s/it]Train:  22%|██▏       | 1100/5000 [4:23:26<15:26:08, 14.25s/it]Train:  22%|██▏       | 1101/5000 [4:23:41<15:26:08, 14.25s/it]Train:  22%|██▏       | 1102/5000 [4:23:55<15:27:01, 14.27s/it]Train:  22%|██▏       | 1103/5000 [4:24:09<15:26:44, 14.27s/it]Train:  22%|██▏       | 1104/5000 [4:24:24<15:27:46, 14.29s/it]Train:  22%|██▏       | 1105/5000 [4:24:38<15:27:23, 14.29s/it]Train:  22%|██▏       | 1106/5000 [4:24:52<15:26:17, 14.27s/it]Train:  22%|██▏       | 1107/5000 [4:25:06<15:26:19, 14.28s/it]Train:  22%|██▏       | 1108/5000 [4:25:21<15:27:01, 14.29s/it]Train:  22%|██▏       | 1109/5000 [4:25:35<15:27:24, 14.30s/it]Train:  22%|██▏       | 1110/5000 [4:25:49<15:26:21, 14.29s/it]                                                               {'loss': 1.76859436, 'token_acc': 0.61582484, 'grad_norm': 0.25876161, 'learning_rate': 1.843e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069528, 'epoch': 0.22, 'global_step/max_steps': '1110/5000', 'percentage': '22.20%', 'elapsed_time': '4h 25m 49s', 'remaining_time': '15h 31m 36s'}
+Train:  22%|██▏       | 1110/5000 [4:25:49<15:26:21, 14.29s/it]Train:  22%|██▏       | 1110/5000 [4:25:49<15:26:21, 14.29s/it]Train:  22%|██▏       | 1111/5000 [4:26:04<15:26:07, 14.29s/it]Train:  22%|██▏       | 1112/5000 [4:26:18<15:25:49, 14.29s/it]Train:  22%|██▏       | 1113/5000 [4:26:32<15:25:06, 14.28s/it]Train:  22%|██▏       | 1114/5000 [4:26:46<15:25:24, 14.29s/it]Train:  22%|██▏       | 1115/5000 [4:27:01<15:24:41, 14.28s/it]Train:  22%|██▏       | 1116/5000 [4:27:15<15:23:48, 14.27s/it]Train:  22%|██▏       | 1117/5000 [4:27:29<15:23:21, 14.27s/it]Train:  22%|██▏       | 1118/5000 [4:27:43<15:23:27, 14.27s/it]Train:  22%|██▏       | 1119/5000 [4:27:58<15:23:01, 14.27s/it]Train:  22%|██▏       | 1120/5000 [4:28:12<15:23:23, 14.28s/it]                                                               {'loss': 1.77011223, 'token_acc': 0.61350843, 'grad_norm': 0.26804185, 'learning_rate': 1.839e-05, 'memory(GiB)': 129.5, 'train_speed(iter/s)': 0.069533, 'epoch': 0.22, 'global_step/max_steps': '1120/5000', 'percentage': '22.40%', 'elapsed_time': '4h 28m 12s', 'remaining_time': '15h 29m 9s'}
+Train:  22%|██▏       | 1120/5000 [4:28:12<15:23:23, 14.28s/it]Train:  22%|██▏       | 1120/5000 [4:28:12<15:23:23, 14.28s/it]Train:  22%|██▏       | 1121/5000 [4:28:26<15:23:03, 14.28s/it]Train:  22%|██▏       | 1122/5000 [4:28:41<15:23:11, 14.28s/it]Train:  22%|██▏       | 1123/5000 [4:28:55<15:22:42, 14.28s/it]Train:  22%|██▏       | 1124/5000 [4:29:09<15:21:53, 14.27s/it]Train:  22%|██▎       | 1125/5000 [4:29:23<15:22:43, 14.29s/it]Train:  23%|██▎       | 1126/5000 [4:29:38<15:22:33, 14.29s/it]Train:  23%|██▎       | 1127/5000 [4:29:52<15:22:30, 14.29s/it]Train:  23%|██▎       | 1128/5000 [4:30:06<15:22:05, 14.29s/it]Train:  23%|██▎       | 1129/5000 [4:30:21<15:21:25, 14.28s/it]Train:  23%|██▎       | 1130/5000 [4:30:35<15:22:01, 14.30s/it]                                                               {'loss': 1.76063995, 'token_acc': 0.61619544, 'grad_norm': 0.25026691, 'learning_rate': 1.835e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069537, 'epoch': 0.23, 'global_step/max_steps': '1130/5000', 'percentage': '22.60%', 'elapsed_time': '4h 30m 35s', 'remaining_time': '15h 26m 42s'}
+Train:  23%|██▎       | 1130/5000 [4:30:35<15:22:01, 14.30s/it]Train:  23%|██▎       | 1130/5000 [4:30:35<15:22:01, 14.30s/it]Train:  23%|██▎       | 1131/5000 [4:30:49<15:21:08, 14.29s/it]Train:  23%|██▎       | 1132/5000 [4:31:03<15:21:00, 14.29s/it]Train:  23%|██▎       | 1133/5000 [4:31:18<15:20:57, 14.29s/it]Train:  23%|██▎       | 1134/5000 [4:31:32<15:20:14, 14.28s/it]Train:  23%|██▎       | 1135/5000 [4:31:46<15:19:38, 14.28s/it]Train:  23%|██▎       | 1136/5000 [4:32:01<15:19:37, 14.28s/it]Train:  23%|██▎       | 1137/5000 [4:32:15<15:19:37, 14.28s/it]Train:  23%|██▎       | 1138/5000 [4:32:29<15:18:57, 14.28s/it]Train:  23%|██▎       | 1139/5000 [4:32:43<15:18:21, 14.27s/it]Train:  23%|██▎       | 1140/5000 [4:32:58<15:18:26, 14.28s/it]                                                               {'loss': 1.76801605, 'token_acc': 0.61456252, 'grad_norm': 0.2581383, 'learning_rate': 1.832e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069541, 'epoch': 0.23, 'global_step/max_steps': '1140/5000', 'percentage': '22.80%', 'elapsed_time': '4h 32m 58s', 'remaining_time': '15h 24m 16s'}
+Train:  23%|██▎       | 1140/5000 [4:32:58<15:18:26, 14.28s/it]Train:  23%|██▎       | 1140/5000 [4:32:58<15:18:26, 14.28s/it]Train:  23%|██▎       | 1141/5000 [4:33:12<15:18:03, 14.27s/it]Train:  23%|██▎       | 1142/5000 [4:33:26<15:17:44, 14.27s/it]Train:  23%|██▎       | 1143/5000 [4:33:41<15:17:52, 14.28s/it]Train:  23%|██▎       | 1144/5000 [4:33:55<15:17:27, 14.28s/it]Train:  23%|██▎       | 1145/5000 [4:34:09<15:17:14, 14.28s/it]Train:  23%|██▎       | 1146/5000 [4:34:23<15:16:47, 14.27s/it]Train:  23%|██▎       | 1147/5000 [4:34:38<15:15:52, 14.26s/it]Train:  23%|██▎       | 1148/5000 [4:34:52<15:16:26, 14.27s/it]Train:  23%|██▎       | 1149/5000 [4:35:06<15:16:28, 14.28s/it]Train:  23%|██▎       | 1150/5000 [4:35:20<15:15:23, 14.27s/it]                                                               {'loss': 1.78214188, 'token_acc': 0.61122081, 'grad_norm': 0.26864767, 'learning_rate': 1.828e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069546, 'epoch': 0.23, 'global_step/max_steps': '1150/5000', 'percentage': '23.00%', 'elapsed_time': '4h 35m 20s', 'remaining_time': '15h 21m 49s'}
+Train:  23%|██▎       | 1150/5000 [4:35:20<15:15:23, 14.27s/it]Train:  23%|██▎       | 1150/5000 [4:35:20<15:15:23, 14.27s/it]Train:  23%|██▎       | 1151/5000 [4:35:35<15:14:46, 14.26s/it]Train:  23%|██▎       | 1152/5000 [4:35:49<15:14:50, 14.26s/it]Train:  23%|██▎       | 1153/5000 [4:36:03<15:15:10, 14.27s/it]Train:  23%|██▎       | 1154/5000 [4:36:18<15:15:01, 14.28s/it]Train:  23%|██▎       | 1155/5000 [4:36:32<15:14:50, 14.28s/it]Train:  23%|██▎       | 1156/5000 [4:36:46<15:15:22, 14.29s/it]Train:  23%|██▎       | 1157/5000 [4:37:00<15:14:14, 14.27s/it]Train:  23%|██▎       | 1158/5000 [4:37:15<15:14:07, 14.28s/it]Train:  23%|██▎       | 1159/5000 [4:37:29<15:13:59, 14.28s/it]Train:  23%|██▎       | 1160/5000 [4:37:43<15:13:46, 14.28s/it]                                                               {'loss': 1.7716568, 'token_acc': 0.61621171, 'grad_norm': 0.25307283, 'learning_rate': 1.824e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06955, 'epoch': 0.23, 'global_step/max_steps': '1160/5000', 'percentage': '23.20%', 'elapsed_time': '4h 37m 43s', 'remaining_time': '15h 19m 22s'}
+Train:  23%|██▎       | 1160/5000 [4:37:43<15:13:46, 14.28s/it]Train:  23%|██▎       | 1160/5000 [4:37:43<15:13:46, 14.28s/it]Train:  23%|██▎       | 1161/5000 [4:37:58<15:14:38, 14.29s/it]Train:  23%|██▎       | 1162/5000 [4:38:12<15:13:43, 14.28s/it]Train:  23%|██▎       | 1163/5000 [4:38:26<15:12:31, 14.27s/it]Train:  23%|██▎       | 1164/5000 [4:38:40<15:12:30, 14.27s/it]Train:  23%|██▎       | 1165/5000 [4:38:55<15:11:28, 14.26s/it]Train:  23%|██▎       | 1166/5000 [4:39:09<15:11:29, 14.26s/it]Train:  23%|██▎       | 1167/5000 [4:39:23<15:12:25, 14.28s/it]Train:  23%|██▎       | 1168/5000 [4:39:37<15:11:35, 14.27s/it]Train:  23%|██▎       | 1169/5000 [4:39:52<15:10:37, 14.26s/it]Train:  23%|██▎       | 1170/5000 [4:40:06<15:10:05, 14.26s/it]                                                               {'loss': 1.77270546, 'token_acc': 0.61932218, 'grad_norm': 0.25487307, 'learning_rate': 1.821e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069555, 'epoch': 0.23, 'global_step/max_steps': '1170/5000', 'percentage': '23.40%', 'elapsed_time': '4h 40m 6s', 'remaining_time': '15h 16m 55s'}
+Train:  23%|██▎       | 1170/5000 [4:40:06<15:10:05, 14.26s/it]Train:  23%|██▎       | 1170/5000 [4:40:06<15:10:05, 14.26s/it]Train:  23%|██▎       | 1171/5000 [4:40:20<15:10:02, 14.26s/it]Train:  23%|██▎       | 1172/5000 [4:40:34<15:11:23, 14.29s/it]Train:  23%|██▎       | 1173/5000 [4:40:49<15:11:18, 14.29s/it]Train:  23%|██▎       | 1174/5000 [4:41:03<15:11:17, 14.29s/it]Train:  24%|██▎       | 1175/5000 [4:41:17<15:10:33, 14.28s/it]Train:  24%|██▎       | 1176/5000 [4:41:32<15:10:22, 14.28s/it]Train:  24%|██▎       | 1177/5000 [4:41:46<15:09:28, 14.27s/it]Train:  24%|██▎       | 1178/5000 [4:42:00<15:09:20, 14.28s/it]Train:  24%|██▎       | 1179/5000 [4:42:14<15:09:39, 14.28s/it]Train:  24%|██▎       | 1180/5000 [4:42:29<15:09:38, 14.29s/it]                                                               {'loss': 1.76822033, 'token_acc': 0.61635899, 'grad_norm': 0.24068889, 'learning_rate': 1.817e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069558, 'epoch': 0.24, 'global_step/max_steps': '1180/5000', 'percentage': '23.60%', 'elapsed_time': '4h 42m 29s', 'remaining_time': '15h 14m 29s'}
+Train:  24%|██▎       | 1180/5000 [4:42:29<15:09:38, 14.29s/it]Train:  24%|██▎       | 1180/5000 [4:42:29<15:09:38, 14.29s/it]Train:  24%|██▎       | 1181/5000 [4:42:43<15:09:18, 14.29s/it]Train:  24%|██▎       | 1182/5000 [4:42:57<15:08:23, 14.28s/it]Train:  24%|██▎       | 1183/5000 [4:43:12<15:07:34, 14.27s/it]Train:  24%|██▎       | 1184/5000 [4:43:26<15:07:03, 14.26s/it]Train:  24%|██▎       | 1185/5000 [4:43:40<15:07:09, 14.27s/it]Train:  24%|██▎       | 1186/5000 [4:43:54<15:05:59, 14.25s/it]Train:  24%|██▎       | 1187/5000 [4:44:09<15:06:15, 14.26s/it]Train:  24%|██▍       | 1188/5000 [4:44:23<15:06:03, 14.26s/it]Train:  24%|██▍       | 1189/5000 [4:44:37<15:05:56, 14.26s/it]Train:  24%|██▍       | 1190/5000 [4:44:51<15:06:22, 14.27s/it]                                                               {'loss': 1.76744232, 'token_acc': 0.61915871, 'grad_norm': 0.25105804, 'learning_rate': 1.813e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069563, 'epoch': 0.24, 'global_step/max_steps': '1190/5000', 'percentage': '23.80%', 'elapsed_time': '4h 44m 51s', 'remaining_time': '15h 12m 2s'}
+Train:  24%|██▍       | 1190/5000 [4:44:51<15:06:22, 14.27s/it]Train:  24%|██▍       | 1190/5000 [4:44:51<15:06:22, 14.27s/it]Train:  24%|██▍       | 1191/5000 [4:45:06<15:07:00, 14.29s/it]Train:  24%|██▍       | 1192/5000 [4:45:20<15:06:30, 14.28s/it]Train:  24%|██▍       | 1193/5000 [4:45:34<15:05:43, 14.27s/it]Train:  24%|██▍       | 1194/5000 [4:45:48<15:05:21, 14.27s/it]Train:  24%|██▍       | 1195/5000 [4:46:03<15:05:42, 14.28s/it]Train:  24%|██▍       | 1196/5000 [4:46:17<15:05:47, 14.29s/it]Train:  24%|██▍       | 1197/5000 [4:46:31<15:05:22, 14.28s/it]Train:  24%|██▍       | 1198/5000 [4:46:46<15:05:47, 14.29s/it]Train:  24%|██▍       | 1199/5000 [4:47:00<15:05:17, 14.29s/it]Train:  24%|██▍       | 1200/5000 [4:47:14<15:04:23, 14.28s/it]                                                               {'loss': 1.77191639, 'token_acc': 0.61958271, 'grad_norm': 0.24179941, 'learning_rate': 1.809e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069567, 'epoch': 0.24, 'global_step/max_steps': '1200/5000', 'percentage': '24.00%', 'elapsed_time': '4h 47m 14s', 'remaining_time': '15h 9m 36s'}
+Train:  24%|██▍       | 1200/5000 [4:47:14<15:04:23, 14.28s/it]Train:  24%|██▍       | 1200/5000 [4:47:14<15:04:23, 14.28s/it]Train:  24%|██▍       | 1201/5000 [4:47:28<15:03:45, 14.27s/it]Train:  24%|██▍       | 1202/5000 [4:47:43<15:03:24, 14.27s/it]Train:  24%|██▍       | 1203/5000 [4:47:57<15:02:55, 14.27s/it]Train:  24%|██▍       | 1204/5000 [4:48:11<15:02:06, 14.26s/it]Train:  24%|██▍       | 1205/5000 [4:48:26<15:02:43, 14.27s/it]Train:  24%|██▍       | 1206/5000 [4:48:40<15:02:36, 14.27s/it]Train:  24%|██▍       | 1207/5000 [4:48:54<15:02:35, 14.28s/it]Train:  24%|██▍       | 1208/5000 [4:49:08<15:02:48, 14.28s/it]Train:  24%|██▍       | 1209/5000 [4:49:23<15:02:40, 14.29s/it]Train:  24%|██▍       | 1210/5000 [4:49:37<15:02:20, 14.29s/it]                                                               {'loss': 1.76131744, 'token_acc': 0.61845832, 'grad_norm': 0.25744972, 'learning_rate': 1.805e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069571, 'epoch': 0.24, 'global_step/max_steps': '1210/5000', 'percentage': '24.20%', 'elapsed_time': '4h 49m 37s', 'remaining_time': '15h 7m 10s'}
+Train:  24%|██▍       | 1210/5000 [4:49:37<15:02:20, 14.29s/it]Train:  24%|██▍       | 1210/5000 [4:49:37<15:02:20, 14.29s/it]Train:  24%|██▍       | 1211/5000 [4:49:51<15:02:14, 14.29s/it]Train:  24%|██▍       | 1212/5000 [4:50:06<15:01:55, 14.29s/it]Train:  24%|██▍       | 1213/5000 [4:50:20<15:01:20, 14.28s/it]Train:  24%|██▍       | 1214/5000 [4:50:34<15:01:17, 14.28s/it]Train:  24%|██▍       | 1215/5000 [4:50:48<15:01:17, 14.29s/it]Train:  24%|██▍       | 1216/5000 [4:51:03<15:00:10, 14.27s/it]Train:  24%|██▍       | 1217/5000 [4:51:17<14:59:40, 14.27s/it]Train:  24%|██▍       | 1218/5000 [4:51:31<14:59:49, 14.28s/it]Train:  24%|██▍       | 1219/5000 [4:51:45<14:59:06, 14.27s/it]Train:  24%|██▍       | 1220/5000 [4:52:00<14:59:31, 14.28s/it]                                                               {'loss': 1.76878052, 'token_acc': 0.61216017, 'grad_norm': 0.25516245, 'learning_rate': 1.801e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069574, 'epoch': 0.24, 'global_step/max_steps': '1220/5000', 'percentage': '24.40%', 'elapsed_time': '4h 52m 0s', 'remaining_time': '15h 4m 44s'}
+Train:  24%|██▍       | 1220/5000 [4:52:00<14:59:31, 14.28s/it]Train:  24%|██▍       | 1220/5000 [4:52:00<14:59:31, 14.28s/it]Train:  24%|██▍       | 1221/5000 [4:52:14<14:59:47, 14.29s/it]Train:  24%|██▍       | 1222/5000 [4:52:28<14:59:45, 14.29s/it]Train:  24%|██▍       | 1223/5000 [4:52:43<14:59:28, 14.29s/it]Train:  24%|██▍       | 1224/5000 [4:52:57<14:58:37, 14.28s/it]Train:  24%|██▍       | 1225/5000 [4:53:11<14:57:50, 14.27s/it]Train:  25%|██▍       | 1226/5000 [4:53:25<14:57:36, 14.27s/it]Train:  25%|██▍       | 1227/5000 [4:53:40<14:57:10, 14.27s/it]Train:  25%|██▍       | 1228/5000 [4:53:54<14:55:53, 14.25s/it]Train:  25%|██▍       | 1229/5000 [4:54:08<14:56:35, 14.27s/it]Train:  25%|██▍       | 1230/5000 [4:54:22<14:56:35, 14.27s/it]                                                               {'loss': 1.76321602, 'token_acc': 0.6151873, 'grad_norm': 0.26497772, 'learning_rate': 1.797e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069578, 'epoch': 0.25, 'global_step/max_steps': '1230/5000', 'percentage': '24.60%', 'elapsed_time': '4h 54m 22s', 'remaining_time': '15h 2m 17s'}
+Train:  25%|██▍       | 1230/5000 [4:54:22<14:56:35, 14.27s/it]Train:  25%|██▍       | 1230/5000 [4:54:22<14:56:35, 14.27s/it]Train:  25%|██▍       | 1231/5000 [4:54:37<14:56:39, 14.27s/it]Train:  25%|██▍       | 1232/5000 [4:54:51<14:56:23, 14.27s/it]Train:  25%|██▍       | 1233/5000 [4:55:05<14:57:15, 14.29s/it]Train:  25%|██▍       | 1234/5000 [4:55:20<14:56:59, 14.29s/it]Train:  25%|██▍       | 1235/5000 [4:55:34<14:57:16, 14.30s/it]Train:  25%|██▍       | 1236/5000 [4:55:48<14:57:11, 14.30s/it]Train:  25%|██▍       | 1237/5000 [4:56:03<14:56:59, 14.30s/it]Train:  25%|██▍       | 1238/5000 [4:56:17<14:56:26, 14.30s/it]Train:  25%|██▍       | 1239/5000 [4:56:31<14:55:52, 14.29s/it]Train:  25%|██▍       | 1240/5000 [4:56:45<14:54:32, 14.27s/it]                                                               {'loss': 1.75833874, 'token_acc': 0.61483952, 'grad_norm': 0.25204811, 'learning_rate': 1.793e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069582, 'epoch': 0.25, 'global_step/max_steps': '1240/5000', 'percentage': '24.80%', 'elapsed_time': '4h 56m 45s', 'remaining_time': '14h 59m 52s'}
+Train:  25%|██▍       | 1240/5000 [4:56:45<14:54:32, 14.27s/it]Train:  25%|██▍       | 1240/5000 [4:56:45<14:54:32, 14.27s/it]Train:  25%|██▍       | 1241/5000 [4:57:00<14:54:19, 14.28s/it]Train:  25%|██▍       | 1242/5000 [4:57:14<14:55:28, 14.30s/it]Train:  25%|██▍       | 1243/5000 [4:57:28<14:55:52, 14.31s/it]Train:  25%|██▍       | 1244/5000 [4:57:43<14:55:22, 14.30s/it]Train:  25%|██▍       | 1245/5000 [4:57:57<14:55:19, 14.31s/it]Train:  25%|██▍       | 1246/5000 [4:58:11<14:54:18, 14.29s/it]Train:  25%|██▍       | 1247/5000 [4:58:26<14:54:10, 14.30s/it]Train:  25%|██▍       | 1248/5000 [4:58:40<14:55:02, 14.31s/it]Train:  25%|██▍       | 1249/5000 [4:58:54<14:53:45, 14.30s/it]Train:  25%|██▌       | 1250/5000 [4:59:08<14:52:41, 14.28s/it]                                                               {'loss': 1.76690636, 'token_acc': 0.61920776, 'grad_norm': 0.24155301, 'learning_rate': 1.789e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069584, 'epoch': 0.25, 'global_step/max_steps': '1250/5000', 'percentage': '25.00%', 'elapsed_time': '4h 59m 8s', 'remaining_time': '14h 57m 26s'}
+Train:  25%|██▌       | 1250/5000 [4:59:08<14:52:41, 14.28s/it]Train:  25%|██▌       | 1250/5000 [4:59:08<14:52:41, 14.28s/it]Train:  25%|██▌       | 1251/5000 [4:59:23<14:52:04, 14.28s/it]Train:  25%|██▌       | 1252/5000 [4:59:37<14:51:56, 14.28s/it]Train:  25%|██▌       | 1253/5000 [4:59:51<14:52:10, 14.29s/it]Train:  25%|██▌       | 1254/5000 [5:00:05<14:51:42, 14.28s/it]Train:  25%|██▌       | 1255/5000 [5:00:20<14:51:13, 14.28s/it]Train:  25%|██▌       | 1256/5000 [5:00:34<14:51:58, 14.29s/it]Train:  25%|██▌       | 1257/5000 [5:00:48<14:51:37, 14.29s/it]Train:  25%|██▌       | 1258/5000 [5:01:03<14:51:09, 14.29s/it]Train:  25%|██▌       | 1259/5000 [5:01:17<14:51:05, 14.29s/it]Train:  25%|██▌       | 1260/5000 [5:01:31<14:50:56, 14.29s/it]                                                               {'loss': 1.7643858, 'token_acc': 0.6126958, 'grad_norm': 0.24922213, 'learning_rate': 1.785e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069588, 'epoch': 0.25, 'global_step/max_steps': '1260/5000', 'percentage': '25.20%', 'elapsed_time': '5h 1m 31s', 'remaining_time': '14h 55m 0s'}
+Train:  25%|██▌       | 1260/5000 [5:01:31<14:50:56, 14.29s/it]Train:  25%|██▌       | 1260/5000 [5:01:31<14:50:56, 14.29s/it]Train:  25%|██▌       | 1261/5000 [5:01:46<14:50:50, 14.30s/it]Train:  25%|██▌       | 1262/5000 [5:02:00<14:49:45, 14.28s/it]Train:  25%|██▌       | 1263/5000 [5:02:14<14:49:15, 14.28s/it]Train:  25%|██▌       | 1264/5000 [5:02:28<14:49:06, 14.28s/it]Train:  25%|██▌       | 1265/5000 [5:02:43<14:49:05, 14.28s/it]Train:  25%|██▌       | 1266/5000 [5:02:57<14:49:04, 14.29s/it]Train:  25%|██▌       | 1267/5000 [5:03:11<14:49:23, 14.30s/it]Train:  25%|██▌       | 1268/5000 [5:03:26<14:48:51, 14.29s/it]Train:  25%|██▌       | 1269/5000 [5:03:40<14:49:22, 14.30s/it]Train:  25%|██▌       | 1270/5000 [5:03:54<14:48:51, 14.30s/it]                                                               {'loss': 1.76771545, 'token_acc': 0.61051252, 'grad_norm': 0.24989091, 'learning_rate': 1.781e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069591, 'epoch': 0.25, 'global_step/max_steps': '1270/5000', 'percentage': '25.40%', 'elapsed_time': '5h 3m 54s', 'remaining_time': '14h 52m 35s'}
+Train:  25%|██▌       | 1270/5000 [5:03:54<14:48:51, 14.30s/it]Train:  25%|██▌       | 1270/5000 [5:03:54<14:48:51, 14.30s/it]Train:  25%|██▌       | 1271/5000 [5:04:08<14:48:08, 14.29s/it]Train:  25%|██▌       | 1272/5000 [5:04:23<14:48:15, 14.30s/it]Train:  25%|██▌       | 1273/5000 [5:04:37<14:48:13, 14.30s/it]Train:  25%|██▌       | 1274/5000 [5:04:51<14:47:19, 14.29s/it]Train:  26%|██▌       | 1275/5000 [5:05:06<14:47:13, 14.29s/it]Train:  26%|██▌       | 1276/5000 [5:05:20<14:47:05, 14.29s/it]Train:  26%|██▌       | 1277/5000 [5:05:34<14:46:21, 14.28s/it]Train:  26%|██▌       | 1278/5000 [5:05:48<14:46:33, 14.29s/it]Train:  26%|██▌       | 1279/5000 [5:06:03<14:46:16, 14.29s/it]Train:  26%|██▌       | 1280/5000 [5:06:17<14:45:26, 14.28s/it]                                                               {'loss': 1.76516953, 'token_acc': 0.62196304, 'grad_norm': 0.24770205, 'learning_rate': 1.777e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069594, 'epoch': 0.26, 'global_step/max_steps': '1280/5000', 'percentage': '25.60%', 'elapsed_time': '5h 6m 17s', 'remaining_time': '14h 50m 9s'}
+Train:  26%|██▌       | 1280/5000 [5:06:17<14:45:26, 14.28s/it]Train:  26%|██▌       | 1280/5000 [5:06:17<14:45:26, 14.28s/it]Train:  26%|██▌       | 1281/5000 [5:06:31<14:45:25, 14.28s/it]Train:  26%|██▌       | 1282/5000 [5:06:46<14:45:06, 14.28s/it]Train:  26%|██▌       | 1283/5000 [5:07:00<14:44:48, 14.28s/it]Train:  26%|██▌       | 1284/5000 [5:07:14<14:44:54, 14.29s/it]Train:  26%|██▌       | 1285/5000 [5:07:28<14:44:21, 14.28s/it]Train:  26%|██▌       | 1286/5000 [5:07:43<14:43:52, 14.28s/it]Train:  26%|██▌       | 1287/5000 [5:07:57<14:43:42, 14.28s/it]Train:  26%|██▌       | 1288/5000 [5:08:11<14:43:41, 14.28s/it]Train:  26%|██▌       | 1289/5000 [5:08:26<14:43:36, 14.29s/it]Train:  26%|██▌       | 1290/5000 [5:08:40<14:43:05, 14.28s/it]                                                               {'loss': 1.76363831, 'token_acc': 0.61777744, 'grad_norm': 0.24962428, 'learning_rate': 1.773e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069597, 'epoch': 0.26, 'global_step/max_steps': '1290/5000', 'percentage': '25.80%', 'elapsed_time': '5h 8m 40s', 'remaining_time': '14h 47m 43s'}
+Train:  26%|██▌       | 1290/5000 [5:08:40<14:43:05, 14.28s/it]Train:  26%|██▌       | 1290/5000 [5:08:40<14:43:05, 14.28s/it]Train:  26%|██▌       | 1291/5000 [5:08:54<14:42:35, 14.28s/it]Train:  26%|██▌       | 1292/5000 [5:09:08<14:42:20, 14.28s/it]Train:  26%|██▌       | 1293/5000 [5:09:23<14:42:26, 14.28s/it]Train:  26%|██▌       | 1294/5000 [5:09:37<14:42:19, 14.28s/it]Train:  26%|██▌       | 1295/5000 [5:09:51<14:41:41, 14.28s/it]Train:  26%|██▌       | 1296/5000 [5:10:06<14:41:56, 14.29s/it]Train:  26%|██▌       | 1297/5000 [5:10:20<14:41:02, 14.28s/it]Train:  26%|██▌       | 1298/5000 [5:10:34<14:40:25, 14.27s/it]Train:  26%|██▌       | 1299/5000 [5:10:48<14:39:58, 14.27s/it]Train:  26%|██▌       | 1300/5000 [5:11:03<14:40:29, 14.28s/it]                                                               {'loss': 1.77122917, 'token_acc': 0.61365547, 'grad_norm': 0.25279349, 'learning_rate': 1.768e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.0696, 'epoch': 0.26, 'global_step/max_steps': '1300/5000', 'percentage': '26.00%', 'elapsed_time': '5h 11m 3s', 'remaining_time': '14h 45m 18s'}
+Train:  26%|██▌       | 1300/5000 [5:11:03<14:40:29, 14.28s/it]Train:  26%|██▌       | 1300/5000 [5:11:03<14:40:29, 14.28s/it]Train:  26%|██▌       | 1301/5000 [5:11:17<14:40:03, 14.28s/it]Train:  26%|██▌       | 1302/5000 [5:11:31<14:40:25, 14.28s/it]Train:  26%|██▌       | 1303/5000 [5:11:45<14:40:12, 14.29s/it]Train:  26%|██▌       | 1304/5000 [5:12:00<14:38:57, 14.27s/it]Train:  26%|██▌       | 1305/5000 [5:12:14<14:38:47, 14.27s/it]Train:  26%|██▌       | 1306/5000 [5:12:28<14:38:43, 14.27s/it]Train:  26%|██▌       | 1307/5000 [5:12:43<14:37:46, 14.26s/it]Train:  26%|██▌       | 1308/5000 [5:12:57<14:37:51, 14.27s/it]Train:  26%|██▌       | 1309/5000 [5:13:11<14:37:29, 14.26s/it]Train:  26%|██▌       | 1310/5000 [5:13:25<14:37:24, 14.27s/it]                                                               {'loss': 1.76005306, 'token_acc': 0.61287885, 'grad_norm': 0.25481433, 'learning_rate': 1.764e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069604, 'epoch': 0.26, 'global_step/max_steps': '1310/5000', 'percentage': '26.20%', 'elapsed_time': '5h 13m 25s', 'remaining_time': '14h 42m 52s'}
+Train:  26%|██▌       | 1310/5000 [5:13:25<14:37:24, 14.27s/it]Train:  26%|██▌       | 1310/5000 [5:13:25<14:37:24, 14.27s/it]Train:  26%|██▌       | 1311/5000 [5:13:40<14:37:03, 14.26s/it]Train:  26%|██▌       | 1312/5000 [5:13:54<14:37:48, 14.28s/it]Train:  26%|██▋       | 1313/5000 [5:14:08<14:38:10, 14.29s/it]Train:  26%|██▋       | 1314/5000 [5:14:22<14:37:42, 14.29s/it]Train:  26%|██▋       | 1315/5000 [5:14:37<14:37:20, 14.29s/it]Train:  26%|██▋       | 1316/5000 [5:14:51<14:37:18, 14.29s/it]Train:  26%|██▋       | 1317/5000 [5:15:05<14:36:58, 14.29s/it]Train:  26%|██▋       | 1318/5000 [5:15:20<14:36:09, 14.28s/it]Train:  26%|██▋       | 1319/5000 [5:15:34<14:36:13, 14.28s/it]Train:  26%|██▋       | 1320/5000 [5:15:48<14:35:24, 14.27s/it]                                                               {'loss': 1.7587429, 'token_acc': 0.61730167, 'grad_norm': 0.24901739, 'learning_rate': 1.76e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069607, 'epoch': 0.26, 'global_step/max_steps': '1320/5000', 'percentage': '26.40%', 'elapsed_time': '5h 15m 48s', 'remaining_time': '14h 40m 26s'}
+Train:  26%|██▋       | 1320/5000 [5:15:48<14:35:24, 14.27s/it]Train:  26%|██▋       | 1320/5000 [5:15:48<14:35:24, 14.27s/it]Train:  26%|██▋       | 1321/5000 [5:16:02<14:36:20, 14.29s/it]Train:  26%|██▋       | 1322/5000 [5:16:17<14:35:10, 14.28s/it]Train:  26%|██▋       | 1323/5000 [5:16:31<14:35:15, 14.28s/it]Train:  26%|██▋       | 1324/5000 [5:16:45<14:35:00, 14.28s/it]Train:  26%|██▋       | 1325/5000 [5:17:00<14:34:31, 14.28s/it]Train:  27%|██▋       | 1326/5000 [5:17:14<14:33:29, 14.26s/it]Train:  27%|██▋       | 1327/5000 [5:17:28<14:33:32, 14.27s/it]Train:  27%|██▋       | 1328/5000 [5:17:42<14:33:20, 14.27s/it]Train:  27%|██▋       | 1329/5000 [5:17:57<14:33:35, 14.28s/it]Train:  27%|██▋       | 1330/5000 [5:18:11<14:33:04, 14.27s/it]                                                               {'loss': 1.75856133, 'token_acc': 0.61655486, 'grad_norm': 0.24961384, 'learning_rate': 1.756e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06961, 'epoch': 0.27, 'global_step/max_steps': '1330/5000', 'percentage': '26.60%', 'elapsed_time': '5h 18m 11s', 'remaining_time': '14h 38m 0s'}
+Train:  27%|██▋       | 1330/5000 [5:18:11<14:33:04, 14.27s/it]Train:  27%|██▋       | 1330/5000 [5:18:11<14:33:04, 14.27s/it]Train:  27%|██▋       | 1331/5000 [5:18:25<14:33:18, 14.28s/it]Train:  27%|██▋       | 1332/5000 [5:18:39<14:32:30, 14.27s/it]Train:  27%|██▋       | 1333/5000 [5:18:54<14:31:16, 14.26s/it]Train:  27%|██▋       | 1334/5000 [5:19:08<14:31:37, 14.27s/it]Train:  27%|██▋       | 1335/5000 [5:19:22<14:31:54, 14.27s/it]Train:  27%|██▋       | 1336/5000 [5:19:37<14:31:27, 14.27s/it]Train:  27%|██▋       | 1337/5000 [5:19:51<14:31:04, 14.27s/it]Train:  27%|██▋       | 1338/5000 [5:20:05<14:30:57, 14.27s/it]Train:  27%|██▋       | 1339/5000 [5:20:19<14:31:25, 14.28s/it]Train:  27%|██▋       | 1340/5000 [5:20:34<14:31:16, 14.28s/it]                                                               {'loss': 1.74849129, 'token_acc': 0.61977015, 'grad_norm': 0.25213054, 'learning_rate': 1.751e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069614, 'epoch': 0.27, 'global_step/max_steps': '1340/5000', 'percentage': '26.80%', 'elapsed_time': '5h 20m 34s', 'remaining_time': '14h 35m 35s'}
+Train:  27%|██▋       | 1340/5000 [5:20:34<14:31:16, 14.28s/it]Train:  27%|██▋       | 1340/5000 [5:20:34<14:31:16, 14.28s/it]Train:  27%|██▋       | 1341/5000 [5:20:48<14:30:23, 14.27s/it]Train:  27%|██▋       | 1342/5000 [5:21:02<14:29:56, 14.27s/it]Train:  27%|██▋       | 1343/5000 [5:21:16<14:29:20, 14.26s/it]Train:  27%|██▋       | 1344/5000 [5:21:31<14:28:38, 14.26s/it]Train:  27%|██▋       | 1345/5000 [5:21:45<14:29:04, 14.27s/it]Train:  27%|██▋       | 1346/5000 [5:21:59<14:28:54, 14.27s/it]Train:  27%|██▋       | 1347/5000 [5:22:14<14:29:08, 14.28s/it]Train:  27%|██▋       | 1348/5000 [5:22:28<14:28:27, 14.27s/it]Train:  27%|██▋       | 1349/5000 [5:22:42<14:27:50, 14.26s/it]Train:  27%|██▋       | 1350/5000 [5:22:56<14:28:07, 14.27s/it]                                                               {'loss': 1.76172314, 'token_acc': 0.61700676, 'grad_norm': 0.24809101, 'learning_rate': 1.747e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069617, 'epoch': 0.27, 'global_step/max_steps': '1350/5000', 'percentage': '27.00%', 'elapsed_time': '5h 22m 56s', 'remaining_time': '14h 33m 9s'}
+Train:  27%|██▋       | 1350/5000 [5:22:56<14:28:07, 14.27s/it]Train:  27%|██▋       | 1350/5000 [5:22:56<14:28:07, 14.27s/it]Train:  27%|██▋       | 1351/5000 [5:23:11<14:27:55, 14.27s/it]Train:  27%|██▋       | 1352/5000 [5:23:25<14:29:00, 14.29s/it]Train:  27%|██▋       | 1353/5000 [5:23:39<14:28:21, 14.29s/it]Train:  27%|██▋       | 1354/5000 [5:23:53<14:27:44, 14.28s/it]Train:  27%|██▋       | 1355/5000 [5:24:08<14:26:54, 14.27s/it]Train:  27%|██▋       | 1356/5000 [5:24:22<14:26:17, 14.26s/it]Train:  27%|██▋       | 1357/5000 [5:24:36<14:26:11, 14.27s/it]Train:  27%|██▋       | 1358/5000 [5:24:51<14:26:24, 14.27s/it]Train:  27%|██▋       | 1359/5000 [5:25:05<14:25:45, 14.27s/it]Train:  27%|██▋       | 1360/5000 [5:25:19<14:25:42, 14.27s/it]                                                               {'loss': 1.75422726, 'token_acc': 0.61797844, 'grad_norm': 0.24870136, 'learning_rate': 1.742e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06962, 'epoch': 0.27, 'global_step/max_steps': '1360/5000', 'percentage': '27.20%', 'elapsed_time': '5h 25m 19s', 'remaining_time': '14h 30m 43s'}
+Train:  27%|██▋       | 1360/5000 [5:25:19<14:25:42, 14.27s/it]Train:  27%|██▋       | 1360/5000 [5:25:19<14:25:42, 14.27s/it]Train:  27%|██▋       | 1361/5000 [5:25:33<14:26:04, 14.28s/it]Train:  27%|██▋       | 1362/5000 [5:25:48<14:25:37, 14.28s/it]Train:  27%|██▋       | 1363/5000 [5:26:02<14:25:40, 14.28s/it]Train:  27%|██▋       | 1364/5000 [5:26:16<14:24:57, 14.27s/it]Train:  27%|██▋       | 1365/5000 [5:26:30<14:25:07, 14.28s/it]Train:  27%|██▋       | 1366/5000 [5:26:45<14:25:07, 14.28s/it]Train:  27%|██▋       | 1367/5000 [5:26:59<14:25:32, 14.29s/it]Train:  27%|██▋       | 1368/5000 [5:27:13<14:26:16, 14.31s/it]Train:  27%|██▋       | 1369/5000 [5:27:28<14:26:03, 14.31s/it]Train:  27%|██▋       | 1370/5000 [5:27:42<14:25:26, 14.30s/it]                                                               {'loss': 1.76284218, 'token_acc': 0.61547397, 'grad_norm': 0.25126308, 'learning_rate': 1.738e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069623, 'epoch': 0.27, 'global_step/max_steps': '1370/5000', 'percentage': '27.40%', 'elapsed_time': '5h 27m 42s', 'remaining_time': '14h 28m 18s'}
+Train:  27%|██▋       | 1370/5000 [5:27:42<14:25:26, 14.30s/it]Train:  27%|██▋       | 1370/5000 [5:27:42<14:25:26, 14.30s/it]Train:  27%|██▋       | 1371/5000 [5:27:56<14:25:41, 14.31s/it]Train:  27%|██▋       | 1372/5000 [5:28:11<14:24:58, 14.31s/it]Train:  27%|██▋       | 1373/5000 [5:28:25<14:24:50, 14.31s/it]Train:  27%|██▋       | 1374/5000 [5:28:39<14:24:13, 14.30s/it]Train:  28%|██▊       | 1375/5000 [5:28:54<14:23:22, 14.29s/it]Train:  28%|██▊       | 1376/5000 [5:29:08<14:22:51, 14.29s/it]Train:  28%|██▊       | 1377/5000 [5:29:22<14:22:42, 14.29s/it]Train:  28%|██▊       | 1378/5000 [5:29:36<14:22:32, 14.29s/it]Train:  28%|██▊       | 1379/5000 [5:29:51<14:21:41, 14.28s/it]Train:  28%|██▊       | 1380/5000 [5:30:05<14:20:47, 14.27s/it]                                                               {'loss': 1.75093575, 'token_acc': 0.62491894, 'grad_norm': 0.25371331, 'learning_rate': 1.733e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069626, 'epoch': 0.28, 'global_step/max_steps': '1380/5000', 'percentage': '27.60%', 'elapsed_time': '5h 30m 5s', 'remaining_time': '14h 25m 53s'}
+Train:  28%|██▊       | 1380/5000 [5:30:05<14:20:47, 14.27s/it]Train:  28%|██▊       | 1380/5000 [5:30:05<14:20:47, 14.27s/it]Train:  28%|██▊       | 1381/5000 [5:30:19<14:20:47, 14.27s/it]Train:  28%|██▊       | 1382/5000 [5:30:33<14:20:34, 14.27s/it]Train:  28%|██▊       | 1383/5000 [5:30:48<14:20:35, 14.28s/it]Train:  28%|██▊       | 1384/5000 [5:31:02<14:20:28, 14.28s/it]Train:  28%|██▊       | 1385/5000 [5:31:16<14:19:53, 14.27s/it]Train:  28%|██▊       | 1386/5000 [5:31:30<14:19:13, 14.26s/it]Train:  28%|██▊       | 1387/5000 [5:31:45<14:19:43, 14.28s/it]Train:  28%|██▊       | 1388/5000 [5:31:59<14:19:22, 14.28s/it]Train:  28%|██▊       | 1389/5000 [5:32:13<14:19:49, 14.29s/it]Train:  28%|██▊       | 1390/5000 [5:32:28<14:19:47, 14.29s/it]                                                               {'loss': 1.76549301, 'token_acc': 0.62071086, 'grad_norm': 0.24636824, 'learning_rate': 1.729e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069628, 'epoch': 0.28, 'global_step/max_steps': '1390/5000', 'percentage': '27.80%', 'elapsed_time': '5h 32m 28s', 'remaining_time': '14h 23m 27s'}
+Train:  28%|██▊       | 1390/5000 [5:32:28<14:19:47, 14.29s/it]Train:  28%|██▊       | 1390/5000 [5:32:28<14:19:47, 14.29s/it]Train:  28%|██▊       | 1391/5000 [5:32:42<14:19:06, 14.28s/it]Train:  28%|██▊       | 1392/5000 [5:32:56<14:18:20, 14.27s/it]Train:  28%|██▊       | 1393/5000 [5:33:11<14:18:49, 14.29s/it]Train:  28%|██▊       | 1394/5000 [5:33:25<14:18:35, 14.29s/it]Train:  28%|██▊       | 1395/5000 [5:33:39<14:17:08, 14.27s/it]Train:  28%|██▊       | 1396/5000 [5:33:53<14:16:11, 14.25s/it]Train:  28%|██▊       | 1397/5000 [5:34:08<14:16:43, 14.27s/it]Train:  28%|██▊       | 1398/5000 [5:34:22<14:16:01, 14.26s/it]Train:  28%|██▊       | 1399/5000 [5:34:36<14:17:25, 14.29s/it]Train:  28%|██▊       | 1400/5000 [5:34:50<14:17:08, 14.29s/it]                                                               {'loss': 1.77000256, 'token_acc': 0.61981917, 'grad_norm': 0.25634906, 'learning_rate': 1.724e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069631, 'epoch': 0.28, 'global_step/max_steps': '1400/5000', 'percentage': '28.00%', 'elapsed_time': '5h 34m 50s', 'remaining_time': '14h 21m 2s'}
+Train:  28%|██▊       | 1400/5000 [5:34:50<14:17:08, 14.29s/it]Train:  28%|██▊       | 1400/5000 [5:34:50<14:17:08, 14.29s/it]Train:  28%|██▊       | 1401/5000 [5:35:05<14:17:24, 14.29s/it]Train:  28%|██▊       | 1402/5000 [5:35:19<14:17:12, 14.29s/it]Train:  28%|██▊       | 1403/5000 [5:35:33<14:16:55, 14.29s/it]Train:  28%|██▊       | 1404/5000 [5:35:48<14:16:06, 14.28s/it]Train:  28%|██▊       | 1405/5000 [5:36:02<14:15:01, 14.27s/it]Train:  28%|██▊       | 1406/5000 [5:36:16<14:14:54, 14.27s/it]Train:  28%|██▊       | 1407/5000 [5:36:30<14:14:43, 14.27s/it]Train:  28%|██▊       | 1408/5000 [5:36:45<14:14:14, 14.27s/it]Train:  28%|██▊       | 1409/5000 [5:36:59<14:13:21, 14.26s/it]Train:  28%|██▊       | 1410/5000 [5:37:13<14:12:24, 14.25s/it]                                                               {'loss': 1.76395741, 'token_acc': 0.60734791, 'grad_norm': 0.2370546, 'learning_rate': 1.72e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069635, 'epoch': 0.28, 'global_step/max_steps': '1410/5000', 'percentage': '28.20%', 'elapsed_time': '5h 37m 13s', 'remaining_time': '14h 18m 36s'}
+Train:  28%|██▊       | 1410/5000 [5:37:13<14:12:24, 14.25s/it]Train:  28%|██▊       | 1410/5000 [5:37:13<14:12:24, 14.25s/it]Train:  28%|██▊       | 1411/5000 [5:37:27<14:12:14, 14.25s/it]Train:  28%|██▊       | 1412/5000 [5:37:42<14:12:12, 14.25s/it]Train:  28%|██▊       | 1413/5000 [5:37:56<14:12:18, 14.26s/it]Train:  28%|██▊       | 1414/5000 [5:38:10<14:12:15, 14.26s/it]Train:  28%|██▊       | 1415/5000 [5:38:24<14:12:48, 14.27s/it]Train:  28%|██▊       | 1416/5000 [5:38:39<14:11:44, 14.26s/it]Train:  28%|██▊       | 1417/5000 [5:38:53<14:11:38, 14.26s/it]Train:  28%|██▊       | 1418/5000 [5:39:07<14:12:14, 14.28s/it]Train:  28%|██▊       | 1419/5000 [5:39:21<14:10:56, 14.26s/it]Train:  28%|██▊       | 1420/5000 [5:39:36<14:11:01, 14.26s/it]                                                               {'loss': 1.74737434, 'token_acc': 0.61451197, 'grad_norm': 0.25018984, 'learning_rate': 1.715e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069638, 'epoch': 0.28, 'global_step/max_steps': '1420/5000', 'percentage': '28.40%', 'elapsed_time': '5h 39m 36s', 'remaining_time': '14h 16m 11s'}
+Train:  28%|██▊       | 1420/5000 [5:39:36<14:11:01, 14.26s/it]Train:  28%|██▊       | 1420/5000 [5:39:36<14:11:01, 14.26s/it]Train:  28%|██▊       | 1421/5000 [5:39:50<14:10:02, 14.25s/it]Train:  28%|██▊       | 1422/5000 [5:40:04<14:10:20, 14.26s/it]Train:  28%|██▊       | 1423/5000 [5:40:19<14:11:42, 14.29s/it]Train:  28%|██▊       | 1424/5000 [5:40:33<14:11:21, 14.28s/it]Train:  28%|██▊       | 1425/5000 [5:40:47<14:11:07, 14.28s/it]Train:  29%|██▊       | 1426/5000 [5:41:01<14:10:01, 14.27s/it]Train:  29%|██▊       | 1427/5000 [5:41:16<14:09:28, 14.26s/it]Train:  29%|██▊       | 1428/5000 [5:41:30<14:09:53, 14.28s/it]Train:  29%|██▊       | 1429/5000 [5:41:44<14:09:42, 14.28s/it]Train:  29%|██▊       | 1430/5000 [5:41:58<14:09:07, 14.27s/it]                                                               {'loss': 1.75251427, 'token_acc': 0.61136385, 'grad_norm': 0.24493979, 'learning_rate': 1.711e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069641, 'epoch': 0.29, 'global_step/max_steps': '1430/5000', 'percentage': '28.60%', 'elapsed_time': '5h 41m 58s', 'remaining_time': '14h 13m 45s'}
+Train:  29%|██▊       | 1430/5000 [5:41:58<14:09:07, 14.27s/it]Train:  29%|██▊       | 1430/5000 [5:41:58<14:09:07, 14.27s/it]Train:  29%|██▊       | 1431/5000 [5:42:13<14:08:38, 14.27s/it]Train:  29%|██▊       | 1432/5000 [5:42:27<14:08:22, 14.27s/it]Train:  29%|██▊       | 1433/5000 [5:42:41<14:08:40, 14.28s/it]Train:  29%|██▊       | 1434/5000 [5:42:56<14:08:46, 14.28s/it]Train:  29%|██▊       | 1435/5000 [5:43:10<14:07:59, 14.27s/it]Train:  29%|██▊       | 1436/5000 [5:43:24<14:08:37, 14.29s/it]Train:  29%|██▊       | 1437/5000 [5:43:38<14:09:11, 14.30s/it]Train:  29%|██▉       | 1438/5000 [5:43:53<14:07:47, 14.28s/it]Train:  29%|██▉       | 1439/5000 [5:44:07<14:07:47, 14.28s/it]Train:  29%|██▉       | 1440/5000 [5:44:21<14:06:47, 14.27s/it]                                                               {'loss': 1.74620895, 'token_acc': 0.61699603, 'grad_norm': 0.25327456, 'learning_rate': 1.706e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069644, 'epoch': 0.29, 'global_step/max_steps': '1440/5000', 'percentage': '28.80%', 'elapsed_time': '5h 44m 21s', 'remaining_time': '14h 11m 20s'}
+Train:  29%|██▉       | 1440/5000 [5:44:21<14:06:47, 14.27s/it]Train:  29%|██▉       | 1440/5000 [5:44:21<14:06:47, 14.27s/it]Train:  29%|██▉       | 1441/5000 [5:44:36<14:07:00, 14.28s/it]Train:  29%|██▉       | 1442/5000 [5:44:50<14:06:13, 14.27s/it]Train:  29%|██▉       | 1443/5000 [5:45:04<14:06:11, 14.27s/it]Train:  29%|██▉       | 1444/5000 [5:45:18<14:05:47, 14.27s/it]Train:  29%|██▉       | 1445/5000 [5:45:33<14:05:45, 14.27s/it]Train:  29%|██▉       | 1446/5000 [5:45:47<14:06:08, 14.28s/it]Train:  29%|██▉       | 1447/5000 [5:46:01<14:06:18, 14.29s/it]Train:  29%|██▉       | 1448/5000 [5:46:15<14:05:11, 14.28s/it]Train:  29%|██▉       | 1449/5000 [5:46:30<14:04:57, 14.28s/it]Train:  29%|██▉       | 1450/5000 [5:46:44<14:05:07, 14.28s/it]                                                               {'loss': 1.74732933, 'token_acc': 0.61598372, 'grad_norm': 0.23996194, 'learning_rate': 1.701e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069646, 'epoch': 0.29, 'global_step/max_steps': '1450/5000', 'percentage': '29.00%', 'elapsed_time': '5h 46m 44s', 'remaining_time': '14h 8m 55s'}
+Train:  29%|██▉       | 1450/5000 [5:46:44<14:05:07, 14.28s/it]Train:  29%|██▉       | 1450/5000 [5:46:44<14:05:07, 14.28s/it]Train:  29%|██▉       | 1451/5000 [5:46:58<14:04:51, 14.28s/it]Train:  29%|██▉       | 1452/5000 [5:47:13<14:04:04, 14.27s/it]Train:  29%|██▉       | 1453/5000 [5:47:27<14:04:36, 14.29s/it]Train:  29%|██▉       | 1454/5000 [5:47:41<14:04:11, 14.28s/it]Train:  29%|██▉       | 1455/5000 [5:47:55<14:03:24, 14.27s/it]Train:  29%|██▉       | 1456/5000 [5:48:10<14:03:53, 14.29s/it]Train:  29%|██▉       | 1457/5000 [5:48:24<14:03:09, 14.28s/it]Train:  29%|██▉       | 1458/5000 [5:48:38<14:02:25, 14.27s/it]Train:  29%|██▉       | 1459/5000 [5:48:53<14:02:22, 14.27s/it]Train:  29%|██▉       | 1460/5000 [5:49:07<14:01:36, 14.26s/it]                                                               {'loss': 1.75771923, 'token_acc': 0.6134027, 'grad_norm': 0.25894853, 'learning_rate': 1.697e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069649, 'epoch': 0.29, 'global_step/max_steps': '1460/5000', 'percentage': '29.20%', 'elapsed_time': '5h 49m 7s', 'remaining_time': '14h 6m 30s'}
+Train:  29%|██▉       | 1460/5000 [5:49:07<14:01:36, 14.26s/it]Train:  29%|██▉       | 1460/5000 [5:49:07<14:01:36, 14.26s/it]Train:  29%|██▉       | 1461/5000 [5:49:21<14:02:13, 14.28s/it]Train:  29%|██▉       | 1462/5000 [5:49:35<14:00:55, 14.26s/it]Train:  29%|██▉       | 1463/5000 [5:49:50<14:00:35, 14.26s/it]Train:  29%|██▉       | 1464/5000 [5:50:04<14:00:35, 14.26s/it]Train:  29%|██▉       | 1465/5000 [5:50:18<14:01:18, 14.28s/it]Train:  29%|██▉       | 1466/5000 [5:50:32<14:00:26, 14.27s/it]Train:  29%|██▉       | 1467/5000 [5:50:47<14:01:08, 14.28s/it]Train:  29%|██▉       | 1468/5000 [5:51:01<14:01:10, 14.29s/it]Train:  29%|██▉       | 1469/5000 [5:51:15<14:01:18, 14.30s/it]Train:  29%|██▉       | 1470/5000 [5:51:30<14:01:01, 14.30s/it]                                                               {'loss': 1.76416283, 'token_acc': 0.61649531, 'grad_norm': 0.24493071, 'learning_rate': 1.692e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069651, 'epoch': 0.29, 'global_step/max_steps': '1470/5000', 'percentage': '29.40%', 'elapsed_time': '5h 51m 30s', 'remaining_time': '14h 4m 5s'}
+Train:  29%|██▉       | 1470/5000 [5:51:30<14:01:01, 14.30s/it]Train:  29%|██▉       | 1470/5000 [5:51:30<14:01:01, 14.30s/it]Train:  29%|██▉       | 1471/5000 [5:51:44<14:01:21, 14.30s/it]Train:  29%|██▉       | 1472/5000 [5:51:58<14:00:54, 14.30s/it]Train:  29%|██▉       | 1473/5000 [5:52:13<13:59:57, 14.29s/it]Train:  29%|██▉       | 1474/5000 [5:52:27<13:59:11, 14.28s/it]Train:  30%|██▉       | 1475/5000 [5:52:41<13:59:17, 14.29s/it]Train:  30%|██▉       | 1476/5000 [5:52:55<13:59:20, 14.29s/it]Train:  30%|██▉       | 1477/5000 [5:53:10<13:58:15, 14.28s/it]Train:  30%|██▉       | 1478/5000 [5:53:24<13:58:44, 14.29s/it]Train:  30%|██▉       | 1479/5000 [5:53:38<13:58:12, 14.28s/it]Train:  30%|██▉       | 1480/5000 [5:53:52<13:57:34, 14.28s/it]                                                               {'loss': 1.74321136, 'token_acc': 0.62227117, 'grad_norm': 0.25171268, 'learning_rate': 1.687e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069654, 'epoch': 0.3, 'global_step/max_steps': '1480/5000', 'percentage': '29.60%', 'elapsed_time': '5h 53m 52s', 'remaining_time': '14h 1m 40s'}
+Train:  30%|██▉       | 1480/5000 [5:53:52<13:57:34, 14.28s/it]Train:  30%|██▉       | 1480/5000 [5:53:52<13:57:34, 14.28s/it]Train:  30%|██▉       | 1481/5000 [5:54:07<13:57:57, 14.29s/it]Train:  30%|██▉       | 1482/5000 [5:54:21<13:57:11, 14.28s/it]Train:  30%|██▉       | 1483/5000 [5:54:35<13:56:37, 14.27s/it]Train:  30%|██▉       | 1484/5000 [5:54:50<13:56:45, 14.28s/it]Train:  30%|██▉       | 1485/5000 [5:55:04<13:56:12, 14.27s/it]Train:  30%|██▉       | 1486/5000 [5:55:18<13:55:58, 14.27s/it]Train:  30%|██▉       | 1487/5000 [5:55:32<13:55:43, 14.27s/it]Train:  30%|██▉       | 1488/5000 [5:55:47<13:55:59, 14.28s/it]Train:  30%|██▉       | 1489/5000 [5:56:01<13:56:18, 14.29s/it]Train:  30%|██▉       | 1490/5000 [5:56:15<13:56:05, 14.29s/it]                                                               {'loss': 1.74793625, 'token_acc': 0.61216916, 'grad_norm': 0.24730948, 'learning_rate': 1.682e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069656, 'epoch': 0.3, 'global_step/max_steps': '1490/5000', 'percentage': '29.80%', 'elapsed_time': '5h 56m 15s', 'remaining_time': '13h 59m 15s'}
+Train:  30%|██▉       | 1490/5000 [5:56:15<13:56:05, 14.29s/it]Train:  30%|██▉       | 1490/5000 [5:56:15<13:56:05, 14.29s/it]Train:  30%|██▉       | 1491/5000 [5:56:30<13:55:09, 14.28s/it]Train:  30%|██▉       | 1492/5000 [5:56:44<13:55:02, 14.28s/it]Train:  30%|██▉       | 1493/5000 [5:56:58<13:54:22, 14.28s/it]Train:  30%|██▉       | 1494/5000 [5:57:12<13:54:27, 14.28s/it]Train:  30%|██▉       | 1495/5000 [5:57:27<13:54:26, 14.28s/it]Train:  30%|██▉       | 1496/5000 [5:57:41<13:53:59, 14.28s/it]Train:  30%|██▉       | 1497/5000 [5:57:55<13:53:54, 14.28s/it]Train:  30%|██▉       | 1498/5000 [5:58:10<13:53:39, 14.28s/it]Train:  30%|██▉       | 1499/5000 [5:58:24<13:53:57, 14.29s/it]Train:  30%|███       | 1500/5000 [5:58:38<13:53:50, 14.29s/it]                                                               {'loss': 1.74955978, 'token_acc': 0.61759232, 'grad_norm': 0.24460146, 'learning_rate': 1.677e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069659, 'epoch': 0.3, 'global_step/max_steps': '1500/5000', 'percentage': '30.00%', 'elapsed_time': '5h 58m 38s', 'remaining_time': '13h 56m 50s'}
+Train:  30%|███       | 1500/5000 [5:58:38<13:53:50, 14.29s/it]Train:  30%|███       | 1500/5000 [5:58:38<13:53:50, 14.29s/it]Train:  30%|███       | 1501/5000 [5:58:52<13:53:32, 14.29s/it]Train:  30%|███       | 1502/5000 [5:59:07<13:53:47, 14.30s/it]Train:  30%|███       | 1503/5000 [5:59:21<13:53:23, 14.30s/it]Train:  30%|███       | 1504/5000 [5:59:35<13:53:27, 14.30s/it]Train:  30%|███       | 1505/5000 [5:59:50<13:52:46, 14.30s/it]Train:  30%|███       | 1506/5000 [6:00:04<13:52:44, 14.30s/it]Train:  30%|███       | 1507/5000 [6:00:18<13:52:48, 14.31s/it]Train:  30%|███       | 1508/5000 [6:00:33<13:51:48, 14.29s/it]Train:  30%|███       | 1509/5000 [6:00:47<13:51:36, 14.29s/it]Train:  30%|███       | 1510/5000 [6:01:01<13:51:48, 14.30s/it]                                                               {'loss': 1.75902061, 'token_acc': 0.62240832, 'grad_norm': 0.24809849, 'learning_rate': 1.672e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06966, 'epoch': 0.3, 'global_step/max_steps': '1510/5000', 'percentage': '30.20%', 'elapsed_time': '6h 1m 1s', 'remaining_time': '13h 54m 25s'}
+Train:  30%|███       | 1510/5000 [6:01:01<13:51:48, 14.30s/it]Train:  30%|███       | 1510/5000 [6:01:01<13:51:48, 14.30s/it]Train:  30%|███       | 1511/5000 [6:01:15<13:50:51, 14.29s/it]Train:  30%|███       | 1512/5000 [6:01:30<13:50:37, 14.29s/it]Train:  30%|███       | 1513/5000 [6:01:44<13:50:37, 14.29s/it]Train:  30%|███       | 1514/5000 [6:01:58<13:50:02, 14.29s/it]Train:  30%|███       | 1515/5000 [6:02:13<13:49:26, 14.28s/it]Train:  30%|███       | 1516/5000 [6:02:27<13:50:07, 14.30s/it]Train:  30%|███       | 1517/5000 [6:02:41<13:49:46, 14.29s/it]Train:  30%|███       | 1518/5000 [6:02:55<13:49:09, 14.29s/it]Train:  30%|███       | 1519/5000 [6:03:10<13:48:40, 14.28s/it]Train:  30%|███       | 1520/5000 [6:03:24<13:48:36, 14.29s/it]                                                               {'loss': 1.75345001, 'token_acc': 0.61670975, 'grad_norm': 0.24449266, 'learning_rate': 1.667e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069663, 'epoch': 0.3, 'global_step/max_steps': '1520/5000', 'percentage': '30.40%', 'elapsed_time': '6h 3m 24s', 'remaining_time': '13h 52m 0s'}
+Train:  30%|███       | 1520/5000 [6:03:24<13:48:36, 14.29s/it]Train:  30%|███       | 1520/5000 [6:03:24<13:48:36, 14.29s/it]Train:  30%|███       | 1521/5000 [6:03:38<13:48:36, 14.29s/it]Train:  30%|███       | 1522/5000 [6:03:53<13:47:53, 14.28s/it]Train:  30%|███       | 1523/5000 [6:04:07<13:46:50, 14.27s/it]Train:  30%|███       | 1524/5000 [6:04:21<13:49:22, 14.32s/it]Train:  30%|███       | 1525/5000 [6:04:35<13:48:13, 14.30s/it]Train:  31%|███       | 1526/5000 [6:04:50<13:47:59, 14.30s/it]Train:  31%|███       | 1527/5000 [6:05:04<13:47:27, 14.30s/it]Train:  31%|███       | 1528/5000 [6:05:18<13:46:46, 14.29s/it]Train:  31%|███       | 1529/5000 [6:05:33<13:46:24, 14.29s/it]Train:  31%|███       | 1530/5000 [6:05:47<13:46:01, 14.28s/it]                                                               {'loss': 1.74871826, 'token_acc': 0.61858957, 'grad_norm': 0.24540664, 'learning_rate': 1.663e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069665, 'epoch': 0.31, 'global_step/max_steps': '1530/5000', 'percentage': '30.60%', 'elapsed_time': '6h 5m 47s', 'remaining_time': '13h 49m 36s'}
+Train:  31%|███       | 1530/5000 [6:05:47<13:46:01, 14.28s/it]Train:  31%|███       | 1530/5000 [6:05:47<13:46:01, 14.28s/it]Train:  31%|███       | 1531/5000 [6:06:01<13:45:39, 14.28s/it]Train:  31%|███       | 1532/5000 [6:06:15<13:45:12, 14.28s/it]Train:  31%|███       | 1533/5000 [6:06:30<13:45:17, 14.28s/it]Train:  31%|███       | 1534/5000 [6:06:44<13:44:31, 14.27s/it]Train:  31%|███       | 1535/5000 [6:06:58<13:44:14, 14.27s/it]Train:  31%|███       | 1536/5000 [6:07:13<13:44:23, 14.28s/it]Train:  31%|███       | 1537/5000 [6:07:27<13:43:34, 14.27s/it]Train:  31%|███       | 1538/5000 [6:07:41<13:43:03, 14.26s/it]Train:  31%|███       | 1539/5000 [6:07:55<13:42:06, 14.25s/it]Train:  31%|███       | 1540/5000 [6:08:10<13:41:21, 14.24s/it]                                                               {'loss': 1.7432869, 'token_acc': 0.61944118, 'grad_norm': 0.25078079, 'learning_rate': 1.658e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069668, 'epoch': 0.31, 'global_step/max_steps': '1540/5000', 'percentage': '30.80%', 'elapsed_time': '6h 8m 10s', 'remaining_time': '13h 47m 10s'}
+Train:  31%|███       | 1540/5000 [6:08:10<13:41:21, 14.24s/it]Train:  31%|███       | 1540/5000 [6:08:10<13:41:21, 14.24s/it]Train:  31%|███       | 1541/5000 [6:08:24<13:42:39, 14.27s/it]Train:  31%|███       | 1542/5000 [6:08:38<13:42:13, 14.27s/it]Train:  31%|███       | 1543/5000 [6:08:52<13:42:50, 14.28s/it]Train:  31%|███       | 1544/5000 [6:09:07<13:42:11, 14.27s/it]Train:  31%|███       | 1545/5000 [6:09:21<13:41:58, 14.27s/it]Train:  31%|███       | 1546/5000 [6:09:35<13:41:17, 14.27s/it]Train:  31%|███       | 1547/5000 [6:09:50<13:42:01, 14.28s/it]Train:  31%|███       | 1548/5000 [6:10:04<13:41:01, 14.27s/it]Train:  31%|███       | 1549/5000 [6:10:18<13:40:06, 14.26s/it]Train:  31%|███       | 1550/5000 [6:10:32<13:40:33, 14.27s/it]                                                               {'loss': 1.73539925, 'token_acc': 0.62634075, 'grad_norm': 0.24890281, 'learning_rate': 1.653e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06967, 'epoch': 0.31, 'global_step/max_steps': '1550/5000', 'percentage': '31.00%', 'elapsed_time': '6h 10m 32s', 'remaining_time': '13h 44m 45s'}
+Train:  31%|███       | 1550/5000 [6:10:32<13:40:33, 14.27s/it]Train:  31%|███       | 1550/5000 [6:10:32<13:40:33, 14.27s/it]Train:  31%|███       | 1551/5000 [6:10:47<13:40:08, 14.27s/it]Train:  31%|███       | 1552/5000 [6:11:01<13:40:19, 14.27s/it]Train:  31%|███       | 1553/5000 [6:11:15<13:39:24, 14.26s/it]Train:  31%|███       | 1554/5000 [6:11:29<13:39:20, 14.27s/it]Train:  31%|███       | 1555/5000 [6:11:44<13:38:30, 14.26s/it]Train:  31%|███       | 1556/5000 [6:11:58<13:38:05, 14.25s/it]Train:  31%|███       | 1557/5000 [6:12:12<13:38:12, 14.26s/it]Train:  31%|███       | 1558/5000 [6:12:26<13:38:28, 14.27s/it]Train:  31%|███       | 1559/5000 [6:12:41<13:38:26, 14.27s/it]Train:  31%|███       | 1560/5000 [6:12:55<13:38:19, 14.27s/it]                                                               {'loss': 1.74602413, 'token_acc': 0.61294132, 'grad_norm': 0.25645784, 'learning_rate': 1.648e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069673, 'epoch': 0.31, 'global_step/max_steps': '1560/5000', 'percentage': '31.20%', 'elapsed_time': '6h 12m 55s', 'remaining_time': '13h 42m 20s'}
+Train:  31%|███       | 1560/5000 [6:12:55<13:38:19, 14.27s/it]Train:  31%|███       | 1560/5000 [6:12:55<13:38:19, 14.27s/it]Train:  31%|███       | 1561/5000 [6:13:09<13:38:08, 14.27s/it]Train:  31%|███       | 1562/5000 [6:13:23<13:37:48, 14.27s/it]Train:  31%|███▏      | 1563/5000 [6:13:38<13:38:25, 14.29s/it]Train:  31%|███▏      | 1564/5000 [6:13:52<13:38:21, 14.29s/it]Train:  31%|███▏      | 1565/5000 [6:14:06<13:39:06, 14.31s/it]Train:  31%|███▏      | 1566/5000 [6:14:21<13:38:31, 14.30s/it]Train:  31%|███▏      | 1567/5000 [6:14:35<13:37:18, 14.28s/it]Train:  31%|███▏      | 1568/5000 [6:14:49<13:36:44, 14.28s/it]Train:  31%|███▏      | 1569/5000 [6:15:04<13:37:12, 14.29s/it]Train:  31%|███▏      | 1570/5000 [6:15:18<13:36:29, 14.28s/it]                                                               {'loss': 1.75034256, 'token_acc': 0.61863711, 'grad_norm': 0.24469014, 'learning_rate': 1.643e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069675, 'epoch': 0.31, 'global_step/max_steps': '1570/5000', 'percentage': '31.40%', 'elapsed_time': '6h 15m 18s', 'remaining_time': '13h 39m 56s'}
+Train:  31%|███▏      | 1570/5000 [6:15:18<13:36:29, 14.28s/it]Train:  31%|███▏      | 1570/5000 [6:15:18<13:36:29, 14.28s/it]Train:  31%|███▏      | 1571/5000 [6:15:32<13:35:55, 14.28s/it]Train:  31%|███▏      | 1572/5000 [6:15:46<13:35:26, 14.27s/it]Train:  31%|███▏      | 1573/5000 [6:16:01<13:35:29, 14.28s/it]Train:  31%|███▏      | 1574/5000 [6:16:15<13:34:59, 14.27s/it]Train:  32%|███▏      | 1575/5000 [6:16:29<13:35:08, 14.28s/it]Train:  32%|███▏      | 1576/5000 [6:16:44<13:35:04, 14.28s/it]Train:  32%|███▏      | 1577/5000 [6:16:58<13:34:35, 14.28s/it]Train:  32%|███▏      | 1578/5000 [6:17:12<13:35:26, 14.30s/it]Train:  32%|███▏      | 1579/5000 [6:17:26<13:35:02, 14.29s/it]Train:  32%|███▏      | 1580/5000 [6:17:41<13:34:26, 14.29s/it]                                                               {'loss': 1.73458424, 'token_acc': 0.62483413, 'grad_norm': 0.24303232, 'learning_rate': 1.637e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069677, 'epoch': 0.32, 'global_step/max_steps': '1580/5000', 'percentage': '31.60%', 'elapsed_time': '6h 17m 41s', 'remaining_time': '13h 37m 31s'}
+Train:  32%|███▏      | 1580/5000 [6:17:41<13:34:26, 14.29s/it]Train:  32%|███▏      | 1580/5000 [6:17:41<13:34:26, 14.29s/it]Train:  32%|███▏      | 1581/5000 [6:17:55<13:35:26, 14.31s/it]Train:  32%|███▏      | 1582/5000 [6:18:09<13:34:17, 14.29s/it]Train:  32%|███▏      | 1583/5000 [6:18:24<13:34:12, 14.30s/it]Train:  32%|███▏      | 1584/5000 [6:18:38<13:33:36, 14.29s/it]Train:  32%|███▏      | 1585/5000 [6:18:52<13:32:27, 14.27s/it]Train:  32%|███▏      | 1586/5000 [6:19:06<13:32:03, 14.27s/it]Train:  32%|███▏      | 1587/5000 [6:19:21<13:32:14, 14.28s/it]Train:  32%|███▏      | 1588/5000 [6:19:35<13:32:07, 14.28s/it]Train:  32%|███▏      | 1589/5000 [6:19:49<13:32:12, 14.29s/it]Train:  32%|███▏      | 1590/5000 [6:20:04<13:32:17, 14.29s/it]                                                               {'loss': 1.74859295, 'token_acc': 0.61733748, 'grad_norm': 0.25500831, 'learning_rate': 1.632e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069679, 'epoch': 0.32, 'global_step/max_steps': '1590/5000', 'percentage': '31.80%', 'elapsed_time': '6h 20m 4s', 'remaining_time': '13h 35m 6s'}
+Train:  32%|███▏      | 1590/5000 [6:20:04<13:32:17, 14.29s/it]Train:  32%|███▏      | 1590/5000 [6:20:04<13:32:17, 14.29s/it]Train:  32%|███▏      | 1591/5000 [6:20:18<13:32:02, 14.29s/it]Train:  32%|███▏      | 1592/5000 [6:20:32<13:31:52, 14.29s/it]Train:  32%|███▏      | 1593/5000 [6:20:46<13:31:39, 14.29s/it]Train:  32%|███▏      | 1594/5000 [6:21:01<13:30:54, 14.28s/it]Train:  32%|███▏      | 1595/5000 [6:21:15<13:30:42, 14.29s/it]Train:  32%|███▏      | 1596/5000 [6:21:29<13:30:39, 14.29s/it]Train:  32%|███▏      | 1597/5000 [6:21:44<13:30:16, 14.29s/it]Train:  32%|███▏      | 1598/5000 [6:21:58<13:30:19, 14.29s/it]Train:  32%|███▏      | 1599/5000 [6:22:12<13:30:26, 14.30s/it]Train:  32%|███▏      | 1600/5000 [6:22:26<13:30:16, 14.30s/it]                                                               {'loss': 1.7511755, 'token_acc': 0.62260207, 'grad_norm': 0.24567893, 'learning_rate': 1.627e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069681, 'epoch': 0.32, 'global_step/max_steps': '1600/5000', 'percentage': '32.00%', 'elapsed_time': '6h 22m 26s', 'remaining_time': '13h 32m 42s'}
+Train:  32%|███▏      | 1600/5000 [6:22:26<13:30:16, 14.30s/it]Train:  32%|███▏      | 1600/5000 [6:22:26<13:30:16, 14.30s/it]Train:  32%|███▏      | 1601/5000 [6:22:41<13:29:46, 14.29s/it]Train:  32%|███▏      | 1602/5000 [6:22:55<13:30:15, 14.31s/it]Train:  32%|███▏      | 1603/5000 [6:23:09<13:30:10, 14.31s/it]Train:  32%|███▏      | 1604/5000 [6:23:24<13:29:52, 14.31s/it]Train:  32%|███▏      | 1605/5000 [6:23:38<13:29:45, 14.31s/it]Train:  32%|███▏      | 1606/5000 [6:23:52<13:29:24, 14.31s/it]Train:  32%|███▏      | 1607/5000 [6:24:07<13:29:27, 14.31s/it]Train:  32%|███▏      | 1608/5000 [6:24:21<13:29:25, 14.32s/it]Train:  32%|███▏      | 1609/5000 [6:24:35<13:28:39, 14.31s/it]Train:  32%|███▏      | 1610/5000 [6:24:50<13:27:32, 14.29s/it]                                                               {'loss': 1.74280548, 'token_acc': 0.62357658, 'grad_norm': 0.24632891, 'learning_rate': 1.622e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069682, 'epoch': 0.32, 'global_step/max_steps': '1610/5000', 'percentage': '32.20%', 'elapsed_time': '6h 24m 50s', 'remaining_time': '13h 30m 18s'}
+Train:  32%|███▏      | 1610/5000 [6:24:50<13:27:32, 14.29s/it]Train:  32%|███▏      | 1610/5000 [6:24:50<13:27:32, 14.29s/it]Train:  32%|███▏      | 1611/5000 [6:25:04<13:27:43, 14.30s/it]Train:  32%|███▏      | 1612/5000 [6:25:18<13:27:40, 14.30s/it]Train:  32%|███▏      | 1613/5000 [6:25:33<13:27:53, 14.31s/it]Train:  32%|███▏      | 1614/5000 [6:25:47<13:27:27, 14.31s/it]Train:  32%|███▏      | 1615/5000 [6:26:01<13:26:02, 14.29s/it]Train:  32%|███▏      | 1616/5000 [6:26:15<13:26:30, 14.30s/it]Train:  32%|███▏      | 1617/5000 [6:26:30<13:25:38, 14.29s/it]Train:  32%|███▏      | 1618/5000 [6:26:44<13:25:44, 14.29s/it]Train:  32%|███▏      | 1619/5000 [6:26:58<13:26:06, 14.31s/it]Train:  32%|███▏      | 1620/5000 [6:27:13<13:25:47, 14.30s/it]                                                               {'loss': 1.75086346, 'token_acc': 0.61874163, 'grad_norm': 0.24713333, 'learning_rate': 1.617e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069683, 'epoch': 0.32, 'global_step/max_steps': '1620/5000', 'percentage': '32.40%', 'elapsed_time': '6h 27m 13s', 'remaining_time': '13h 27m 53s'}
+Train:  32%|███▏      | 1620/5000 [6:27:13<13:25:47, 14.30s/it]Train:  32%|███▏      | 1620/5000 [6:27:13<13:25:47, 14.30s/it]Train:  32%|███▏      | 1621/5000 [6:27:27<13:26:00, 14.31s/it]Train:  32%|███▏      | 1622/5000 [6:27:41<13:25:39, 14.31s/it]Train:  32%|███▏      | 1623/5000 [6:27:56<13:26:02, 14.32s/it]Train:  32%|███▏      | 1624/5000 [6:28:10<13:25:31, 14.32s/it]Train:  32%|███▎      | 1625/5000 [6:28:24<13:24:48, 14.31s/it]Train:  33%|███▎      | 1626/5000 [6:28:38<13:24:10, 14.30s/it]Train:  33%|███▎      | 1627/5000 [6:28:53<13:23:27, 14.29s/it]Train:  33%|███▎      | 1628/5000 [6:29:07<13:23:25, 14.30s/it]Train:  33%|███▎      | 1629/5000 [6:29:21<13:22:52, 14.29s/it]Train:  33%|███▎      | 1630/5000 [6:29:36<13:22:12, 14.28s/it]                                                               {'loss': 1.73988514, 'token_acc': 0.61732095, 'grad_norm': 0.24220666, 'learning_rate': 1.612e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069685, 'epoch': 0.33, 'global_step/max_steps': '1630/5000', 'percentage': '32.60%', 'elapsed_time': '6h 29m 36s', 'remaining_time': '13h 25m 29s'}
+Train:  33%|███▎      | 1630/5000 [6:29:36<13:22:12, 14.28s/it]Train:  33%|███▎      | 1630/5000 [6:29:36<13:22:12, 14.28s/it]Train:  33%|███▎      | 1631/5000 [6:29:50<13:21:52, 14.28s/it]Train:  33%|███▎      | 1632/5000 [6:30:04<13:22:00, 14.29s/it]Train:  33%|███▎      | 1633/5000 [6:30:18<13:22:51, 14.31s/it]Train:  33%|███▎      | 1634/5000 [6:30:33<13:22:00, 14.30s/it]Train:  33%|███▎      | 1635/5000 [6:30:47<13:21:21, 14.29s/it]Train:  33%|███▎      | 1636/5000 [6:31:01<13:21:05, 14.29s/it]Train:  33%|███▎      | 1637/5000 [6:31:16<13:20:59, 14.29s/it]Train:  33%|███▎      | 1638/5000 [6:31:30<13:21:47, 14.31s/it]Train:  33%|███▎      | 1639/5000 [6:31:44<13:21:35, 14.31s/it]Train:  33%|███▎      | 1640/5000 [6:31:59<13:21:59, 14.32s/it]                                                               {'loss': 1.73453369, 'token_acc': 0.61953366, 'grad_norm': 0.2807079, 'learning_rate': 1.606e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069686, 'epoch': 0.33, 'global_step/max_steps': '1640/5000', 'percentage': '32.80%', 'elapsed_time': '6h 31m 59s', 'remaining_time': '13h 23m 5s'}
+Train:  33%|███▎      | 1640/5000 [6:31:59<13:21:59, 14.32s/it]Train:  33%|███▎      | 1640/5000 [6:31:59<13:21:59, 14.32s/it]Train:  33%|███▎      | 1641/5000 [6:32:13<13:21:07, 14.31s/it]Train:  33%|███▎      | 1642/5000 [6:32:27<13:20:19, 14.30s/it]Train:  33%|███▎      | 1643/5000 [6:32:41<13:19:44, 14.29s/it]Train:  33%|███▎      | 1644/5000 [6:32:56<13:19:28, 14.29s/it]Train:  33%|███▎      | 1645/5000 [6:33:10<13:18:33, 14.28s/it]Train:  33%|███▎      | 1646/5000 [6:33:24<13:17:56, 14.27s/it]Train:  33%|███▎      | 1647/5000 [6:33:39<13:17:52, 14.28s/it]Train:  33%|███▎      | 1648/5000 [6:33:53<13:18:14, 14.29s/it]Train:  33%|███▎      | 1649/5000 [6:34:07<13:17:55, 14.29s/it]Train:  33%|███▎      | 1650/5000 [6:34:21<13:17:15, 14.28s/it]                                                               {'loss': 1.74738636, 'token_acc': 0.61650926, 'grad_norm': 0.24479362, 'learning_rate': 1.601e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069688, 'epoch': 0.33, 'global_step/max_steps': '1650/5000', 'percentage': '33.00%', 'elapsed_time': '6h 34m 21s', 'remaining_time': '13h 20m 40s'}
+Train:  33%|███▎      | 1650/5000 [6:34:21<13:17:15, 14.28s/it]Train:  33%|███▎      | 1650/5000 [6:34:21<13:17:15, 14.28s/it]Train:  33%|███▎      | 1651/5000 [6:34:36<13:16:58, 14.28s/it]Train:  33%|███▎      | 1652/5000 [6:34:50<13:16:15, 14.27s/it]Train:  33%|███▎      | 1653/5000 [6:35:04<13:16:23, 14.28s/it]Train:  33%|███▎      | 1654/5000 [6:35:18<13:15:37, 14.27s/it]Train:  33%|███▎      | 1655/5000 [6:35:33<13:15:57, 14.28s/it]Train:  33%|███▎      | 1656/5000 [6:35:47<13:15:33, 14.27s/it]Train:  33%|███▎      | 1657/5000 [6:36:01<13:15:23, 14.28s/it]Train:  33%|███▎      | 1658/5000 [6:36:16<13:15:14, 14.28s/it]Train:  33%|███▎      | 1659/5000 [6:36:30<13:15:23, 14.28s/it]Train:  33%|███▎      | 1660/5000 [6:36:44<13:15:02, 14.28s/it]                                                               {'loss': 1.75399284, 'token_acc': 0.61696013, 'grad_norm': 0.24470943, 'learning_rate': 1.596e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06969, 'epoch': 0.33, 'global_step/max_steps': '1660/5000', 'percentage': '33.20%', 'elapsed_time': '6h 36m 44s', 'remaining_time': '13h 18m 16s'}
+Train:  33%|███▎      | 1660/5000 [6:36:44<13:15:02, 14.28s/it]Train:  33%|███▎      | 1660/5000 [6:36:44<13:15:02, 14.28s/it]Train:  33%|███▎      | 1661/5000 [6:36:59<13:15:27, 14.29s/it]Train:  33%|███▎      | 1662/5000 [6:37:13<13:15:22, 14.30s/it]Train:  33%|███▎      | 1663/5000 [6:37:27<13:14:14, 14.28s/it]Train:  33%|███▎      | 1664/5000 [6:37:41<13:13:45, 14.28s/it]Train:  33%|███▎      | 1665/5000 [6:37:56<13:13:18, 14.27s/it]Train:  33%|███▎      | 1666/5000 [6:38:10<13:13:07, 14.27s/it]Train:  33%|███▎      | 1667/5000 [6:38:24<13:13:25, 14.28s/it]Train:  33%|███▎      | 1668/5000 [6:38:38<13:12:51, 14.28s/it]Train:  33%|███▎      | 1669/5000 [6:38:53<13:13:11, 14.29s/it]Train:  33%|███▎      | 1670/5000 [6:39:07<13:12:58, 14.29s/it]                                                               {'loss': 1.73749638, 'token_acc': 0.62196806, 'grad_norm': 0.2384062, 'learning_rate': 1.59e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069692, 'epoch': 0.33, 'global_step/max_steps': '1670/5000', 'percentage': '33.40%', 'elapsed_time': '6h 39m 7s', 'remaining_time': '13h 15m 51s'}
+Train:  33%|███▎      | 1670/5000 [6:39:07<13:12:58, 14.29s/it]Train:  33%|███▎      | 1670/5000 [6:39:07<13:12:58, 14.29s/it]Train:  33%|███▎      | 1671/5000 [6:39:21<13:12:07, 14.28s/it]Train:  33%|███▎      | 1672/5000 [6:39:36<13:11:35, 14.27s/it]Train:  33%|███▎      | 1673/5000 [6:39:50<13:11:58, 14.28s/it]Train:  33%|███▎      | 1674/5000 [6:40:04<13:11:25, 14.28s/it]Train:  34%|███▎      | 1675/5000 [6:40:18<13:11:46, 14.29s/it]Train:  34%|███▎      | 1676/5000 [6:40:33<13:11:07, 14.28s/it]Train:  34%|███▎      | 1677/5000 [6:40:47<13:11:01, 14.28s/it]Train:  34%|███▎      | 1678/5000 [6:41:01<13:10:05, 14.27s/it]Train:  34%|███▎      | 1679/5000 [6:41:16<13:10:28, 14.28s/it]Train:  34%|███▎      | 1680/5000 [6:41:30<13:10:35, 14.29s/it]                                                               {'loss': 1.73654137, 'token_acc': 0.61909102, 'grad_norm': 0.24469741, 'learning_rate': 1.585e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069694, 'epoch': 0.34, 'global_step/max_steps': '1680/5000', 'percentage': '33.60%', 'elapsed_time': '6h 41m 30s', 'remaining_time': '13h 13m 27s'}
+Train:  34%|███▎      | 1680/5000 [6:41:30<13:10:35, 14.29s/it]Train:  34%|███▎      | 1680/5000 [6:41:30<13:10:35, 14.29s/it]Train:  34%|███▎      | 1681/5000 [6:41:44<13:10:31, 14.29s/it]Train:  34%|███▎      | 1682/5000 [6:41:58<13:10:46, 14.30s/it]Train:  34%|███▎      | 1683/5000 [6:42:13<13:10:08, 14.29s/it]Train:  34%|███▎      | 1684/5000 [6:42:27<13:09:16, 14.28s/it]Train:  34%|███▎      | 1685/5000 [6:42:41<13:08:12, 14.27s/it]Train:  34%|███▎      | 1686/5000 [6:42:55<13:07:57, 14.27s/it]Train:  34%|███▎      | 1687/5000 [6:43:10<13:08:46, 14.29s/it]Train:  34%|███▍      | 1688/5000 [6:43:24<13:08:32, 14.29s/it]Train:  34%|███▍      | 1689/5000 [6:43:38<13:07:55, 14.28s/it]Train:  34%|███▍      | 1690/5000 [6:43:53<13:08:35, 14.29s/it]                                                               {'loss': 1.73833847, 'token_acc': 0.61793102, 'grad_norm': 0.23911957, 'learning_rate': 1.58e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069696, 'epoch': 0.34, 'global_step/max_steps': '1690/5000', 'percentage': '33.80%', 'elapsed_time': '6h 43m 53s', 'remaining_time': '13h 11m 2s'}
+Train:  34%|███▍      | 1690/5000 [6:43:53<13:08:35, 14.29s/it]Train:  34%|███▍      | 1690/5000 [6:43:53<13:08:35, 14.29s/it]Train:  34%|███▍      | 1691/5000 [6:44:07<13:08:13, 14.29s/it]Train:  34%|███▍      | 1692/5000 [6:44:21<13:07:27, 14.28s/it]Train:  34%|███▍      | 1693/5000 [6:44:36<13:07:41, 14.29s/it]Train:  34%|███▍      | 1694/5000 [6:44:50<13:07:45, 14.30s/it]Train:  34%|███▍      | 1695/5000 [6:45:04<13:06:51, 14.28s/it]Train:  34%|███▍      | 1696/5000 [6:45:18<13:07:00, 14.29s/it]Train:  34%|███▍      | 1697/5000 [6:45:33<13:07:27, 14.30s/it]Train:  34%|███▍      | 1698/5000 [6:45:47<13:06:22, 14.29s/it]Train:  34%|███▍      | 1699/5000 [6:46:01<13:05:45, 14.28s/it]Train:  34%|███▍      | 1700/5000 [6:46:16<13:05:26, 14.28s/it]                                                               {'loss': 1.75657711, 'token_acc': 0.61944084, 'grad_norm': 0.24390142, 'learning_rate': 1.574e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069698, 'epoch': 0.34, 'global_step/max_steps': '1700/5000', 'percentage': '34.00%', 'elapsed_time': '6h 46m 16s', 'remaining_time': '13h 8m 38s'}
+Train:  34%|███▍      | 1700/5000 [6:46:16<13:05:26, 14.28s/it]Train:  34%|███▍      | 1700/5000 [6:46:16<13:05:26, 14.28s/it]Train:  34%|███▍      | 1701/5000 [6:46:30<13:04:32, 14.27s/it]Train:  34%|███▍      | 1702/5000 [6:46:44<13:04:26, 14.27s/it]Train:  34%|███▍      | 1703/5000 [6:46:58<13:04:47, 14.28s/it]Train:  34%|███▍      | 1704/5000 [6:47:13<13:05:23, 14.30s/it]Train:  34%|███▍      | 1705/5000 [6:47:27<13:04:51, 14.29s/it]Train:  34%|███▍      | 1706/5000 [6:47:41<13:04:55, 14.30s/it]Train:  34%|███▍      | 1707/5000 [6:47:56<13:04:26, 14.29s/it]Train:  34%|███▍      | 1708/5000 [6:48:10<13:04:12, 14.29s/it]Train:  34%|███▍      | 1709/5000 [6:48:24<13:04:21, 14.30s/it]Train:  34%|███▍      | 1710/5000 [6:48:39<13:04:38, 14.31s/it]                                                               {'loss': 1.74579391, 'token_acc': 0.61354641, 'grad_norm': 0.23748779, 'learning_rate': 1.569e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069699, 'epoch': 0.34, 'global_step/max_steps': '1710/5000', 'percentage': '34.20%', 'elapsed_time': '6h 48m 39s', 'remaining_time': '13h 6m 14s'}
+Train:  34%|███▍      | 1710/5000 [6:48:39<13:04:38, 14.31s/it]Train:  34%|███▍      | 1710/5000 [6:48:39<13:04:38, 14.31s/it]Train:  34%|███▍      | 1711/5000 [6:48:53<13:03:56, 14.30s/it]Train:  34%|███▍      | 1712/5000 [6:49:07<13:03:39, 14.30s/it]Train:  34%|███▍      | 1713/5000 [6:49:21<13:03:01, 14.29s/it]Train:  34%|███▍      | 1714/5000 [6:49:36<13:01:48, 14.28s/it]Train:  34%|███▍      | 1715/5000 [6:49:50<13:00:58, 14.26s/it]Train:  34%|███▍      | 1716/5000 [6:50:04<13:01:44, 14.28s/it]Train:  34%|███▍      | 1717/5000 [6:50:18<13:01:32, 14.28s/it]Train:  34%|███▍      | 1718/5000 [6:50:33<13:01:30, 14.29s/it]Train:  34%|███▍      | 1719/5000 [6:50:47<13:01:51, 14.30s/it]Train:  34%|███▍      | 1720/5000 [6:51:01<13:01:53, 14.30s/it]                                                               {'loss': 1.74066505, 'token_acc': 0.61957529, 'grad_norm': 0.24120797, 'learning_rate': 1.563e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069701, 'epoch': 0.34, 'global_step/max_steps': '1720/5000', 'percentage': '34.40%', 'elapsed_time': '6h 51m 1s', 'remaining_time': '13h 3m 49s'}
+Train:  34%|███▍      | 1720/5000 [6:51:01<13:01:53, 14.30s/it]Train:  34%|███▍      | 1720/5000 [6:51:01<13:01:53, 14.30s/it]Train:  34%|███▍      | 1721/5000 [6:51:16<13:01:46, 14.31s/it]Train:  34%|███▍      | 1722/5000 [6:51:30<13:01:09, 14.30s/it]Train:  34%|███▍      | 1723/5000 [6:51:44<13:01:12, 14.30s/it]Train:  34%|███▍      | 1724/5000 [6:51:59<13:01:27, 14.31s/it]Train:  34%|███▍      | 1725/5000 [6:52:13<13:00:37, 14.30s/it]Train:  35%|███▍      | 1726/5000 [6:52:27<13:00:09, 14.30s/it]Train:  35%|███▍      | 1727/5000 [6:52:41<12:59:07, 14.28s/it]Train:  35%|███▍      | 1728/5000 [6:52:56<12:58:38, 14.28s/it]Train:  35%|███▍      | 1729/5000 [6:53:10<12:58:22, 14.28s/it]Train:  35%|███▍      | 1730/5000 [6:53:24<12:58:42, 14.29s/it]                                                               {'loss': 1.73886223, 'token_acc': 0.62472111, 'grad_norm': 0.2471038, 'learning_rate': 1.558e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069703, 'epoch': 0.35, 'global_step/max_steps': '1730/5000', 'percentage': '34.60%', 'elapsed_time': '6h 53m 24s', 'remaining_time': '13h 1m 25s'}
+Train:  35%|███▍      | 1730/5000 [6:53:24<12:58:42, 14.29s/it]Train:  35%|███▍      | 1730/5000 [6:53:24<12:58:42, 14.29s/it]Train:  35%|███▍      | 1731/5000 [6:53:39<12:58:12, 14.28s/it]Train:  35%|███▍      | 1732/5000 [6:53:53<12:57:31, 14.28s/it]Train:  35%|███▍      | 1733/5000 [6:54:07<12:57:02, 14.27s/it]Train:  35%|███▍      | 1734/5000 [6:54:21<12:56:57, 14.27s/it]Train:  35%|███▍      | 1735/5000 [6:54:36<12:56:49, 14.28s/it]Train:  35%|███▍      | 1736/5000 [6:54:50<12:57:18, 14.29s/it]Train:  35%|███▍      | 1737/5000 [6:55:04<12:57:09, 14.29s/it]Train:  35%|███▍      | 1738/5000 [6:55:19<12:57:39, 14.30s/it]Train:  35%|███▍      | 1739/5000 [6:55:33<12:57:16, 14.30s/it]Train:  35%|███▍      | 1740/5000 [6:55:47<12:56:43, 14.30s/it]                                                               {'loss': 1.74573364, 'token_acc': 0.61979581, 'grad_norm': 0.24800697, 'learning_rate': 1.552e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069704, 'epoch': 0.35, 'global_step/max_steps': '1740/5000', 'percentage': '34.80%', 'elapsed_time': '6h 55m 47s', 'remaining_time': '12h 59m 1s'}
+Train:  35%|███▍      | 1740/5000 [6:55:47<12:56:43, 14.30s/it]Train:  35%|███▍      | 1740/5000 [6:55:47<12:56:43, 14.30s/it]Train:  35%|███▍      | 1741/5000 [6:56:01<12:56:12, 14.29s/it]Train:  35%|███▍      | 1742/5000 [6:56:16<12:55:31, 14.28s/it]Train:  35%|███▍      | 1743/5000 [6:56:30<12:55:23, 14.28s/it]Train:  35%|███▍      | 1744/5000 [6:56:44<12:54:46, 14.28s/it]Train:  35%|███▍      | 1745/5000 [6:56:59<12:54:47, 14.28s/it]Train:  35%|███▍      | 1746/5000 [6:57:13<12:54:24, 14.28s/it]Train:  35%|███▍      | 1747/5000 [6:57:27<12:54:22, 14.28s/it]Train:  35%|███▍      | 1748/5000 [6:57:41<12:54:02, 14.28s/it]Train:  35%|███▍      | 1749/5000 [6:57:56<12:54:09, 14.29s/it]Train:  35%|███▌      | 1750/5000 [6:58:10<12:53:57, 14.29s/it]                                                               {'loss': 1.73896065, 'token_acc': 0.62497097, 'grad_norm': 0.2450434, 'learning_rate': 1.547e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069706, 'epoch': 0.35, 'global_step/max_steps': '1750/5000', 'percentage': '35.00%', 'elapsed_time': '6h 58m 10s', 'remaining_time': '12h 56m 36s'}
+Train:  35%|███▌      | 1750/5000 [6:58:10<12:53:57, 14.29s/it]Train:  35%|███▌      | 1750/5000 [6:58:10<12:53:57, 14.29s/it]Train:  35%|███▌      | 1751/5000 [6:58:24<12:53:40, 14.29s/it]Train:  35%|███▌      | 1752/5000 [6:58:39<12:53:48, 14.29s/it]Train:  35%|███▌      | 1753/5000 [6:58:53<12:53:27, 14.29s/it]Train:  35%|███▌      | 1754/5000 [6:59:07<12:52:49, 14.29s/it]Train:  35%|███▌      | 1755/5000 [6:59:21<12:51:42, 14.27s/it]Train:  35%|███▌      | 1756/5000 [6:59:36<12:51:05, 14.26s/it]Train:  35%|███▌      | 1757/5000 [6:59:50<12:51:08, 14.27s/it]Train:  35%|███▌      | 1758/5000 [7:00:04<12:50:30, 14.26s/it]Train:  35%|███▌      | 1759/5000 [7:00:18<12:50:34, 14.27s/it]Train:  35%|███▌      | 1760/5000 [7:00:33<12:49:44, 14.25s/it]                                                               {'loss': 1.74815845, 'token_acc': 0.62068941, 'grad_norm': 0.25006324, 'learning_rate': 1.541e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069708, 'epoch': 0.35, 'global_step/max_steps': '1760/5000', 'percentage': '35.20%', 'elapsed_time': '7h 0m 33s', 'remaining_time': '12h 54m 11s'}
+Train:  35%|███▌      | 1760/5000 [7:00:33<12:49:44, 14.25s/it]Train:  35%|███▌      | 1760/5000 [7:00:33<12:49:44, 14.25s/it]Train:  35%|███▌      | 1761/5000 [7:00:47<12:50:06, 14.27s/it]Train:  35%|███▌      | 1762/5000 [7:01:01<12:49:21, 14.26s/it]Train:  35%|███▌      | 1763/5000 [7:01:15<12:49:20, 14.26s/it]Train:  35%|███▌      | 1764/5000 [7:01:30<12:49:02, 14.26s/it]Train:  35%|███▌      | 1765/5000 [7:01:44<12:49:31, 14.27s/it]Train:  35%|███▌      | 1766/5000 [7:01:58<12:48:59, 14.27s/it]Train:  35%|███▌      | 1767/5000 [7:02:13<12:48:21, 14.26s/it]Train:  35%|███▌      | 1768/5000 [7:02:27<12:48:03, 14.26s/it]Train:  35%|███▌      | 1769/5000 [7:02:41<12:48:23, 14.27s/it]Train:  35%|███▌      | 1770/5000 [7:02:55<12:48:33, 14.28s/it]                                                               {'loss': 1.73401241, 'token_acc': 0.62085282, 'grad_norm': 0.24083947, 'learning_rate': 1.536e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06971, 'epoch': 0.35, 'global_step/max_steps': '1770/5000', 'percentage': '35.40%', 'elapsed_time': '7h 2m 55s', 'remaining_time': '12h 51m 47s'}
+Train:  35%|███▌      | 1770/5000 [7:02:55<12:48:33, 14.28s/it]Train:  35%|███▌      | 1770/5000 [7:02:55<12:48:33, 14.28s/it]Train:  35%|███▌      | 1771/5000 [7:03:10<12:48:29, 14.28s/it]Train:  35%|███▌      | 1772/5000 [7:03:24<12:48:05, 14.28s/it]Train:  35%|███▌      | 1773/5000 [7:03:38<12:47:22, 14.27s/it]Train:  35%|███▌      | 1774/5000 [7:03:52<12:46:27, 14.26s/it]Train:  36%|███▌      | 1775/5000 [7:04:07<12:46:17, 14.26s/it]Train:  36%|███▌      | 1776/5000 [7:04:21<12:46:32, 14.27s/it]Train:  36%|███▌      | 1777/5000 [7:04:35<12:45:27, 14.25s/it]Train:  36%|███▌      | 1778/5000 [7:04:49<12:45:48, 14.26s/it]Train:  36%|███▌      | 1779/5000 [7:05:04<12:45:21, 14.26s/it]Train:  36%|███▌      | 1780/5000 [7:05:18<12:45:23, 14.26s/it]                                                               {'loss': 1.72788773, 'token_acc': 0.62895625, 'grad_norm': 0.24601075, 'learning_rate': 1.53e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069713, 'epoch': 0.36, 'global_step/max_steps': '1780/5000', 'percentage': '35.60%', 'elapsed_time': '7h 5m 18s', 'remaining_time': '12h 49m 22s'}
+Train:  36%|███▌      | 1780/5000 [7:05:18<12:45:23, 14.26s/it]Train:  36%|███▌      | 1780/5000 [7:05:18<12:45:23, 14.26s/it]Train:  36%|███▌      | 1781/5000 [7:05:32<12:44:42, 14.25s/it]Train:  36%|███▌      | 1782/5000 [7:05:46<12:44:20, 14.25s/it]Train:  36%|███▌      | 1783/5000 [7:06:01<12:44:01, 14.25s/it]Train:  36%|███▌      | 1784/5000 [7:06:15<12:43:57, 14.25s/it]Train:  36%|███▌      | 1785/5000 [7:06:29<12:43:56, 14.26s/it]Train:  36%|███▌      | 1786/5000 [7:06:43<12:43:56, 14.26s/it]Train:  36%|███▌      | 1787/5000 [7:06:58<12:43:16, 14.25s/it]Train:  36%|███▌      | 1788/5000 [7:07:12<12:42:57, 14.25s/it]Train:  36%|███▌      | 1789/5000 [7:07:26<12:42:40, 14.25s/it]Train:  36%|███▌      | 1790/5000 [7:07:40<12:42:29, 14.25s/it]                                                               {'loss': 1.72430134, 'token_acc': 0.61862864, 'grad_norm': 0.24759319, 'learning_rate': 1.525e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069715, 'epoch': 0.36, 'global_step/max_steps': '1790/5000', 'percentage': '35.80%', 'elapsed_time': '7h 7m 40s', 'remaining_time': '12h 46m 57s'}
+Train:  36%|███▌      | 1790/5000 [7:07:40<12:42:29, 14.25s/it]Train:  36%|███▌      | 1790/5000 [7:07:40<12:42:29, 14.25s/it]Train:  36%|███▌      | 1791/5000 [7:07:55<12:42:49, 14.26s/it]Train:  36%|███▌      | 1792/5000 [7:08:09<12:43:08, 14.27s/it]Train:  36%|███▌      | 1793/5000 [7:08:23<12:42:50, 14.27s/it]Train:  36%|███▌      | 1794/5000 [7:08:38<12:43:13, 14.28s/it]Train:  36%|███▌      | 1795/5000 [7:08:52<12:45:23, 14.33s/it]Train:  36%|███▌      | 1796/5000 [7:09:06<12:44:36, 14.32s/it]Train:  36%|███▌      | 1797/5000 [7:09:21<12:43:29, 14.30s/it]Train:  36%|███▌      | 1798/5000 [7:09:35<12:42:23, 14.29s/it]Train:  36%|███▌      | 1799/5000 [7:09:49<12:44:45, 14.33s/it]Train:  36%|███▌      | 1800/5000 [7:10:04<12:43:46, 14.32s/it]                                                               {'loss': 1.73046017, 'token_acc': 0.61839963, 'grad_norm': 0.2455382, 'learning_rate': 1.519e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069716, 'epoch': 0.36, 'global_step/max_steps': '1800/5000', 'percentage': '36.00%', 'elapsed_time': '7h 10m 4s', 'remaining_time': '12h 44m 33s'}
+Train:  36%|███▌      | 1800/5000 [7:10:04<12:43:46, 14.32s/it]Train:  36%|███▌      | 1800/5000 [7:10:04<12:43:46, 14.32s/it]Train:  36%|███▌      | 1801/5000 [7:10:18<12:43:22, 14.32s/it]Train:  36%|███▌      | 1802/5000 [7:10:32<12:42:40, 14.31s/it]Train:  36%|███▌      | 1803/5000 [7:10:47<12:45:26, 14.37s/it]Train:  36%|███▌      | 1804/5000 [7:11:01<12:43:47, 14.34s/it]Train:  36%|███▌      | 1805/5000 [7:11:15<12:42:49, 14.33s/it]Train:  36%|███▌      | 1806/5000 [7:11:30<12:42:13, 14.32s/it]Train:  36%|███▌      | 1807/5000 [7:11:44<12:42:41, 14.33s/it]Train:  36%|███▌      | 1808/5000 [7:11:58<12:41:49, 14.32s/it]Train:  36%|███▌      | 1809/5000 [7:12:13<12:41:17, 14.31s/it]Train:  36%|███▌      | 1810/5000 [7:12:27<12:41:51, 14.33s/it]                                                               {'loss': 1.7468523, 'token_acc': 0.61857935, 'grad_norm': 0.24339065, 'learning_rate': 1.513e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069716, 'epoch': 0.36, 'global_step/max_steps': '1810/5000', 'percentage': '36.20%', 'elapsed_time': '7h 12m 27s', 'remaining_time': '12h 42m 10s'}
+Train:  36%|███▌      | 1810/5000 [7:12:27<12:41:51, 14.33s/it]Train:  36%|███▌      | 1810/5000 [7:12:27<12:41:51, 14.33s/it]Train:  36%|███▌      | 1811/5000 [7:12:41<12:41:07, 14.32s/it]Train:  36%|███▌      | 1812/5000 [7:12:55<12:40:22, 14.31s/it]Train:  36%|███▋      | 1813/5000 [7:13:10<12:40:24, 14.32s/it]Train:  36%|███▋      | 1814/5000 [7:13:24<12:39:37, 14.31s/it]Train:  36%|███▋      | 1815/5000 [7:13:38<12:39:42, 14.31s/it]Train:  36%|███▋      | 1816/5000 [7:13:53<12:39:34, 14.31s/it]Train:  36%|███▋      | 1817/5000 [7:14:07<12:38:56, 14.31s/it]Train:  36%|███▋      | 1818/5000 [7:14:21<12:38:29, 14.30s/it]Train:  36%|███▋      | 1819/5000 [7:14:36<12:38:09, 14.30s/it]Train:  36%|███▋      | 1820/5000 [7:14:50<12:37:48, 14.30s/it]                                                               {'loss': 1.74033852, 'token_acc': 0.61976764, 'grad_norm': 0.24221838, 'learning_rate': 1.508e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069718, 'epoch': 0.36, 'global_step/max_steps': '1820/5000', 'percentage': '36.40%', 'elapsed_time': '7h 14m 50s', 'remaining_time': '12h 39m 46s'}
+Train:  36%|███▋      | 1820/5000 [7:14:50<12:37:48, 14.30s/it]Train:  36%|███▋      | 1820/5000 [7:14:50<12:37:48, 14.30s/it]Train:  36%|███▋      | 1821/5000 [7:15:04<12:36:53, 14.29s/it]Train:  36%|███▋      | 1822/5000 [7:15:18<12:36:39, 14.29s/it]Train:  36%|███▋      | 1823/5000 [7:15:33<12:36:01, 14.28s/it]Train:  36%|███▋      | 1824/5000 [7:15:47<12:35:46, 14.28s/it]Train:  36%|███▋      | 1825/5000 [7:16:01<12:34:50, 14.26s/it]Train:  37%|███▋      | 1826/5000 [7:16:16<12:35:08, 14.27s/it]Train:  37%|███▋      | 1827/5000 [7:16:30<12:34:59, 14.28s/it]Train:  37%|███▋      | 1828/5000 [7:16:44<12:34:56, 14.28s/it]Train:  37%|███▋      | 1829/5000 [7:16:58<12:34:54, 14.28s/it]Train:  37%|███▋      | 1830/5000 [7:17:13<12:34:29, 14.28s/it]                                                               {'loss': 1.73276367, 'token_acc': 0.61777512, 'grad_norm': 0.23737761, 'learning_rate': 1.502e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069719, 'epoch': 0.37, 'global_step/max_steps': '1830/5000', 'percentage': '36.60%', 'elapsed_time': '7h 17m 13s', 'remaining_time': '12h 37m 22s'}
+Train:  37%|███▋      | 1830/5000 [7:17:13<12:34:29, 14.28s/it]Train:  37%|███▋      | 1830/5000 [7:17:13<12:34:29, 14.28s/it]Train:  37%|███▋      | 1831/5000 [7:17:27<12:34:27, 14.28s/it]Train:  37%|███▋      | 1832/5000 [7:17:41<12:34:05, 14.28s/it]Train:  37%|███▋      | 1833/5000 [7:17:55<12:33:23, 14.27s/it]Train:  37%|███▋      | 1834/5000 [7:18:10<12:33:06, 14.27s/it]Train:  37%|███▋      | 1835/5000 [7:18:24<12:33:04, 14.28s/it]Train:  37%|███▋      | 1836/5000 [7:18:38<12:32:30, 14.27s/it]Train:  37%|███▋      | 1837/5000 [7:18:53<12:32:55, 14.28s/it]Train:  37%|███▋      | 1838/5000 [7:19:07<12:32:52, 14.29s/it]Train:  37%|███▋      | 1839/5000 [7:19:21<12:32:16, 14.28s/it]Train:  37%|███▋      | 1840/5000 [7:19:35<12:31:59, 14.28s/it]                                                               {'loss': 1.7289814, 'token_acc': 0.62447539, 'grad_norm': 0.24905638, 'learning_rate': 1.496e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069721, 'epoch': 0.37, 'global_step/max_steps': '1840/5000', 'percentage': '36.80%', 'elapsed_time': '7h 19m 35s', 'remaining_time': '12h 34m 57s'}
+Train:  37%|███▋      | 1840/5000 [7:19:35<12:31:59, 14.28s/it]Train:  37%|███▋      | 1840/5000 [7:19:35<12:31:59, 14.28s/it]Train:  37%|███▋      | 1841/5000 [7:19:50<12:32:09, 14.29s/it]Train:  37%|███▋      | 1842/5000 [7:20:04<12:31:49, 14.28s/it]Train:  37%|███▋      | 1843/5000 [7:20:18<12:31:25, 14.28s/it]Train:  37%|███▋      | 1844/5000 [7:20:33<12:30:45, 14.27s/it]Train:  37%|███▋      | 1845/5000 [7:20:47<12:30:41, 14.28s/it]Train:  37%|███▋      | 1846/5000 [7:21:01<12:30:00, 14.27s/it]Train:  37%|███▋      | 1847/5000 [7:21:15<12:29:30, 14.26s/it]Train:  37%|███▋      | 1848/5000 [7:21:30<12:29:01, 14.26s/it]Train:  37%|███▋      | 1849/5000 [7:21:44<12:28:14, 14.25s/it]Train:  37%|███▋      | 1850/5000 [7:21:58<12:28:24, 14.26s/it]                                                               {'loss': 1.73951416, 'token_acc': 0.62389223, 'grad_norm': 0.24328235, 'learning_rate': 1.49e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069723, 'epoch': 0.37, 'global_step/max_steps': '1850/5000', 'percentage': '37.00%', 'elapsed_time': '7h 21m 58s', 'remaining_time': '12h 32m 33s'}
+Train:  37%|███▋      | 1850/5000 [7:21:58<12:28:24, 14.26s/it]Train:  37%|███▋      | 1850/5000 [7:21:58<12:28:24, 14.26s/it]Train:  37%|███▋      | 1851/5000 [7:22:12<12:28:37, 14.26s/it]Train:  37%|███▋      | 1852/5000 [7:22:27<12:28:00, 14.26s/it]Train:  37%|███▋      | 1853/5000 [7:22:41<12:28:00, 14.26s/it]Train:  37%|███▋      | 1854/5000 [7:22:55<12:27:52, 14.26s/it]Train:  37%|███▋      | 1855/5000 [7:23:09<12:27:26, 14.26s/it]Train:  37%|███▋      | 1856/5000 [7:23:24<12:27:29, 14.26s/it]Train:  37%|███▋      | 1857/5000 [7:23:38<12:27:14, 14.26s/it]Train:  37%|███▋      | 1858/5000 [7:23:52<12:26:36, 14.26s/it]Train:  37%|███▋      | 1859/5000 [7:24:06<12:26:29, 14.26s/it]Train:  37%|███▋      | 1860/5000 [7:24:21<12:25:58, 14.25s/it]                                                               {'loss': 1.73137131, 'token_acc': 0.62465588, 'grad_norm': 0.24105994, 'learning_rate': 1.485e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069725, 'epoch': 0.37, 'global_step/max_steps': '1860/5000', 'percentage': '37.20%', 'elapsed_time': '7h 24m 21s', 'remaining_time': '12h 30m 8s'}
+Train:  37%|███▋      | 1860/5000 [7:24:21<12:25:58, 14.25s/it]Train:  37%|███▋      | 1860/5000 [7:24:21<12:25:58, 14.25s/it]Train:  37%|███▋      | 1861/5000 [7:24:35<12:26:09, 14.26s/it]Train:  37%|███▋      | 1862/5000 [7:24:49<12:26:32, 14.27s/it]Train:  37%|███▋      | 1863/5000 [7:25:04<12:26:18, 14.27s/it]Train:  37%|███▋      | 1864/5000 [7:25:18<12:25:28, 14.26s/it]Train:  37%|███▋      | 1865/5000 [7:25:32<12:25:44, 14.27s/it]Train:  37%|███▋      | 1866/5000 [7:25:46<12:25:31, 14.27s/it]Train:  37%|███▋      | 1867/5000 [7:26:01<12:25:06, 14.27s/it]Train:  37%|███▋      | 1868/5000 [7:26:15<12:24:50, 14.27s/it]Train:  37%|███▋      | 1869/5000 [7:26:29<12:24:07, 14.26s/it]Train:  37%|███▋      | 1870/5000 [7:26:43<12:23:34, 14.25s/it]                                                               {'loss': 1.71843224, 'token_acc': 0.62571671, 'grad_norm': 0.24205504, 'learning_rate': 1.479e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069727, 'epoch': 0.37, 'global_step/max_steps': '1870/5000', 'percentage': '37.40%', 'elapsed_time': '7h 26m 43s', 'remaining_time': '12h 27m 44s'}
+Train:  37%|███▋      | 1870/5000 [7:26:43<12:23:34, 14.25s/it]Train:  37%|███▋      | 1870/5000 [7:26:43<12:23:34, 14.25s/it]Train:  37%|███▋      | 1871/5000 [7:26:58<12:24:19, 14.27s/it]Train:  37%|███▋      | 1872/5000 [7:27:12<12:23:42, 14.27s/it]Train:  37%|███▋      | 1873/5000 [7:27:26<12:23:50, 14.27s/it]Train:  37%|███▋      | 1874/5000 [7:27:40<12:23:30, 14.27s/it]Train:  38%|███▊      | 1875/5000 [7:27:55<12:22:56, 14.26s/it]Train:  38%|███▊      | 1876/5000 [7:28:09<12:22:12, 14.25s/it]Train:  38%|███▊      | 1877/5000 [7:28:23<12:22:11, 14.26s/it]Train:  38%|███▊      | 1878/5000 [7:28:37<12:21:49, 14.26s/it]Train:  38%|███▊      | 1879/5000 [7:28:52<12:22:24, 14.27s/it]Train:  38%|███▊      | 1880/5000 [7:29:06<12:22:35, 14.28s/it]                                                               {'loss': 1.73163223, 'token_acc': 0.61406349, 'grad_norm': 0.24032839, 'learning_rate': 1.473e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069729, 'epoch': 0.38, 'global_step/max_steps': '1880/5000', 'percentage': '37.60%', 'elapsed_time': '7h 29m 6s', 'remaining_time': '12h 25m 19s'}
+Train:  38%|███▊      | 1880/5000 [7:29:06<12:22:35, 14.28s/it]Train:  38%|███▊      | 1880/5000 [7:29:06<12:22:35, 14.28s/it]Train:  38%|███▊      | 1881/5000 [7:29:20<12:22:37, 14.29s/it]Train:  38%|███▊      | 1882/5000 [7:29:35<12:22:38, 14.29s/it]Train:  38%|███▊      | 1883/5000 [7:29:49<12:22:13, 14.29s/it]Train:  38%|███▊      | 1884/5000 [7:30:03<12:22:36, 14.30s/it]Train:  38%|███▊      | 1885/5000 [7:30:18<12:21:13, 14.28s/it]Train:  38%|███▊      | 1886/5000 [7:30:32<12:21:38, 14.29s/it]Train:  38%|███▊      | 1887/5000 [7:30:46<12:21:42, 14.30s/it]Train:  38%|███▊      | 1888/5000 [7:31:00<12:21:05, 14.29s/it]Train:  38%|███▊      | 1889/5000 [7:31:15<12:20:14, 14.28s/it]Train:  38%|███▊      | 1890/5000 [7:31:29<12:20:13, 14.28s/it]                                                               {'loss': 1.73202286, 'token_acc': 0.62305422, 'grad_norm': 0.23627166, 'learning_rate': 1.467e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06973, 'epoch': 0.38, 'global_step/max_steps': '1890/5000', 'percentage': '37.80%', 'elapsed_time': '7h 31m 29s', 'remaining_time': '12h 22m 55s'}
+Train:  38%|███▊      | 1890/5000 [7:31:29<12:20:13, 14.28s/it]Train:  38%|███▊      | 1890/5000 [7:31:29<12:20:13, 14.28s/it]Train:  38%|███▊      | 1891/5000 [7:31:43<12:19:57, 14.28s/it]Train:  38%|███▊      | 1892/5000 [7:31:58<12:19:48, 14.28s/it]Train:  38%|███▊      | 1893/5000 [7:32:12<12:20:00, 14.29s/it]Train:  38%|███▊      | 1894/5000 [7:32:26<12:18:52, 14.27s/it]Train:  38%|███▊      | 1895/5000 [7:32:40<12:18:35, 14.27s/it]Train:  38%|███▊      | 1896/5000 [7:32:55<12:18:54, 14.28s/it]Train:  38%|███▊      | 1897/5000 [7:33:09<12:18:23, 14.28s/it]Train:  38%|███▊      | 1898/5000 [7:33:23<12:18:01, 14.27s/it]Train:  38%|███▊      | 1899/5000 [7:33:37<12:17:28, 14.27s/it]Train:  38%|███▊      | 1900/5000 [7:33:52<12:17:29, 14.27s/it]                                                               {'loss': 1.71726437, 'token_acc': 0.6221127, 'grad_norm': 0.24366048, 'learning_rate': 1.461e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069732, 'epoch': 0.38, 'global_step/max_steps': '1900/5000', 'percentage': '38.00%', 'elapsed_time': '7h 33m 52s', 'remaining_time': '12h 20m 31s'}
+Train:  38%|███▊      | 1900/5000 [7:33:52<12:17:29, 14.27s/it]Train:  38%|███▊      | 1900/5000 [7:33:52<12:17:29, 14.27s/it]Train:  38%|███▊      | 1901/5000 [7:34:06<12:17:30, 14.28s/it]Train:  38%|███▊      | 1902/5000 [7:34:20<12:17:07, 14.28s/it]Train:  38%|███▊      | 1903/5000 [7:34:35<12:17:27, 14.29s/it]Train:  38%|███▊      | 1904/5000 [7:34:49<12:16:25, 14.27s/it]Train:  38%|███▊      | 1905/5000 [7:35:03<12:16:05, 14.27s/it]Train:  38%|███▊      | 1906/5000 [7:35:17<12:16:12, 14.28s/it]Train:  38%|███▊      | 1907/5000 [7:35:32<12:16:36, 14.29s/it]Train:  38%|███▊      | 1908/5000 [7:35:46<12:16:36, 14.29s/it]Train:  38%|███▊      | 1909/5000 [7:36:00<12:16:42, 14.30s/it]Train:  38%|███▊      | 1910/5000 [7:36:23<14:19:07, 16.68s/it]                                                               {'loss': 1.73907089, 'token_acc': 0.61969329, 'grad_norm': 0.2401915, 'learning_rate': 1.455e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069713, 'epoch': 1.0, 'global_step/max_steps': '1910/5000', 'percentage': '38.20%', 'elapsed_time': '7h 36m 23s', 'remaining_time': '12h 18m 20s'}
+Train:  38%|███▊      | 1910/5000 [7:36:23<14:19:07, 16.68s/it]Train:  38%|███▊      | 1910/5000 [7:36:23<14:19:07, 16.68s/it]Train:  38%|███▊      | 1911/5000 [7:36:37<13:41:38, 15.96s/it]Train:  38%|███▊      | 1912/5000 [7:36:51<13:15:42, 15.46s/it]Train:  38%|███▊      | 1913/5000 [7:37:05<12:57:49, 15.12s/it]Train:  38%|███▊      | 1914/5000 [7:37:20<12:45:06, 14.88s/it]Train:  38%|███▊      | 1915/5000 [7:37:34<12:36:09, 14.71s/it]Train:  38%|███▊      | 1916/5000 [7:37:48<12:30:04, 14.59s/it]Train:  38%|███▊      | 1917/5000 [7:38:03<12:25:45, 14.51s/it]Train:  38%|███▊      | 1918/5000 [7:38:17<12:21:59, 14.45s/it]Train:  38%|███▊      | 1919/5000 [7:38:31<12:19:42, 14.41s/it]Train:  38%|███▊      | 1920/5000 [7:38:46<12:17:16, 14.36s/it]                                                               {'loss': 1.72433968, 'token_acc': 0.62692328, 'grad_norm': 0.24536827, 'learning_rate': 1.45e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069714, 'epoch': 1.0, 'global_step/max_steps': '1920/5000', 'percentage': '38.40%', 'elapsed_time': '7h 38m 46s', 'remaining_time': '12h 15m 56s'}
+Train:  38%|███▊      | 1920/5000 [7:38:46<12:17:16, 14.36s/it]Train:  38%|███▊      | 1920/5000 [7:38:46<12:17:16, 14.36s/it]Train:  38%|███▊      | 1921/5000 [7:39:00<12:16:06, 14.34s/it]Train:  38%|███▊      | 1922/5000 [7:39:14<12:15:03, 14.33s/it]Train:  38%|███▊      | 1923/5000 [7:39:28<12:14:31, 14.32s/it]Train:  38%|███▊      | 1924/5000 [7:39:43<12:13:51, 14.31s/it]Train:  38%|███▊      | 1925/5000 [7:39:57<12:13:17, 14.31s/it]Train:  39%|███▊      | 1926/5000 [7:40:11<12:12:21, 14.29s/it]Train:  39%|███▊      | 1927/5000 [7:40:26<12:12:14, 14.30s/it]Train:  39%|███▊      | 1928/5000 [7:40:40<12:11:54, 14.30s/it]Train:  39%|███▊      | 1929/5000 [7:40:54<12:10:51, 14.28s/it]Train:  39%|███▊      | 1930/5000 [7:41:08<12:10:17, 14.27s/it]                                                               {'loss': 1.71949577, 'token_acc': 0.62894358, 'grad_norm': 0.24087878, 'learning_rate': 1.444e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069716, 'epoch': 1.0, 'global_step/max_steps': '1930/5000', 'percentage': '38.60%', 'elapsed_time': '7h 41m 8s', 'remaining_time': '12h 13m 32s'}
+Train:  39%|███▊      | 1930/5000 [7:41:08<12:10:17, 14.27s/it]Train:  39%|███▊      | 1930/5000 [7:41:08<12:10:17, 14.27s/it]Train:  39%|███▊      | 1931/5000 [7:41:23<12:09:20, 14.26s/it]Train:  39%|███▊      | 1932/5000 [7:41:37<12:09:04, 14.26s/it]Train:  39%|███▊      | 1933/5000 [7:41:51<12:08:38, 14.25s/it]Train:  39%|███▊      | 1934/5000 [7:42:05<12:08:21, 14.25s/it]Train:  39%|███▊      | 1935/5000 [7:42:20<12:08:12, 14.26s/it]Train:  39%|███▊      | 1936/5000 [7:42:34<12:07:40, 14.25s/it]Train:  39%|███▊      | 1937/5000 [7:42:48<12:07:15, 14.25s/it]Train:  39%|███▉      | 1938/5000 [7:43:02<12:06:45, 14.24s/it]Train:  39%|███▉      | 1939/5000 [7:43:17<12:06:59, 14.25s/it]Train:  39%|███▉      | 1940/5000 [7:43:31<12:07:15, 14.26s/it]                                                               {'loss': 1.72974091, 'token_acc': 0.62097793, 'grad_norm': 0.24569236, 'learning_rate': 1.438e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069718, 'epoch': 1.01, 'global_step/max_steps': '1940/5000', 'percentage': '38.80%', 'elapsed_time': '7h 43m 31s', 'remaining_time': '12h 11m 7s'}
+Train:  39%|███▉      | 1940/5000 [7:43:31<12:07:15, 14.26s/it]Train:  39%|███▉      | 1940/5000 [7:43:31<12:07:15, 14.26s/it]Train:  39%|███▉      | 1941/5000 [7:43:45<12:07:29, 14.27s/it]Train:  39%|███▉      | 1942/5000 [7:43:59<12:07:08, 14.27s/it]Train:  39%|███▉      | 1943/5000 [7:44:14<12:06:33, 14.26s/it]Train:  39%|███▉      | 1944/5000 [7:44:28<12:06:03, 14.26s/it]Train:  39%|███▉      | 1945/5000 [7:44:42<12:06:12, 14.26s/it]Train:  39%|███▉      | 1946/5000 [7:44:57<12:06:06, 14.27s/it]Train:  39%|███▉      | 1947/5000 [7:45:11<12:05:44, 14.26s/it]Train:  39%|███▉      | 1948/5000 [7:45:25<12:05:24, 14.26s/it]Train:  39%|███▉      | 1949/5000 [7:45:39<12:06:17, 14.28s/it]Train:  39%|███▉      | 1950/5000 [7:45:54<12:05:56, 14.28s/it]                                                               {'loss': 1.73213043, 'token_acc': 0.6265291, 'grad_norm': 0.23850413, 'learning_rate': 1.432e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06972, 'epoch': 1.01, 'global_step/max_steps': '1950/5000', 'percentage': '39.00%', 'elapsed_time': '7h 45m 54s', 'remaining_time': '12h 8m 43s'}
+Train:  39%|███▉      | 1950/5000 [7:45:54<12:05:56, 14.28s/it]Train:  39%|███▉      | 1950/5000 [7:45:54<12:05:56, 14.28s/it]Train:  39%|███▉      | 1951/5000 [7:46:08<12:06:02, 14.29s/it]Train:  39%|███▉      | 1952/5000 [7:46:22<12:05:45, 14.29s/it]Train:  39%|███▉      | 1953/5000 [7:46:36<12:05:10, 14.28s/it]Train:  39%|███▉      | 1954/5000 [7:46:51<12:04:10, 14.26s/it]Train:  39%|███▉      | 1955/5000 [7:47:05<12:03:25, 14.25s/it]Train:  39%|███▉      | 1956/5000 [7:47:19<12:03:22, 14.26s/it]Train:  39%|███▉      | 1957/5000 [7:47:33<12:02:52, 14.25s/it]Train:  39%|███▉      | 1958/5000 [7:47:48<12:02:20, 14.25s/it]Train:  39%|███▉      | 1959/5000 [7:48:02<12:01:55, 14.24s/it]Train:  39%|███▉      | 1960/5000 [7:48:16<12:01:33, 14.24s/it]                                                               {'loss': 1.72522087, 'token_acc': 0.61364479, 'grad_norm': 0.23969309, 'learning_rate': 1.426e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069722, 'epoch': 1.01, 'global_step/max_steps': '1960/5000', 'percentage': '39.20%', 'elapsed_time': '7h 48m 16s', 'remaining_time': '12h 6m 18s'}
+Train:  39%|███▉      | 1960/5000 [7:48:16<12:01:33, 14.24s/it]Train:  39%|███▉      | 1960/5000 [7:48:16<12:01:33, 14.24s/it]Train:  39%|███▉      | 1961/5000 [7:48:30<12:01:00, 14.24s/it]Train:  39%|███▉      | 1962/5000 [7:48:45<12:00:43, 14.23s/it]Train:  39%|███▉      | 1963/5000 [7:48:59<12:00:25, 14.23s/it]Train:  39%|███▉      | 1964/5000 [7:49:13<11:59:43, 14.22s/it]Train:  39%|███▉      | 1965/5000 [7:49:27<11:59:57, 14.23s/it]Train:  39%|███▉      | 1966/5000 [7:49:42<11:59:36, 14.23s/it]Train:  39%|███▉      | 1967/5000 [7:49:56<11:59:46, 14.24s/it]Train:  39%|███▉      | 1968/5000 [7:50:10<11:59:26, 14.24s/it]Train:  39%|███▉      | 1969/5000 [7:50:24<12:00:47, 14.27s/it]Train:  39%|███▉      | 1970/5000 [7:50:39<12:00:05, 14.26s/it]                                                               {'loss': 1.72335415, 'token_acc': 0.62311616, 'grad_norm': 0.24162158, 'learning_rate': 1.42e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069725, 'epoch': 1.01, 'global_step/max_steps': '1970/5000', 'percentage': '39.40%', 'elapsed_time': '7h 50m 39s', 'remaining_time': '12h 3m 53s'}
+Train:  39%|███▉      | 1970/5000 [7:50:39<12:00:05, 14.26s/it]Train:  39%|███▉      | 1970/5000 [7:50:39<12:00:05, 14.26s/it]Train:  39%|███▉      | 1971/5000 [7:50:53<11:59:17, 14.25s/it]Train:  39%|███▉      | 1972/5000 [7:51:07<11:59:06, 14.25s/it]Train:  39%|███▉      | 1973/5000 [7:51:21<11:59:22, 14.26s/it]Train:  39%|███▉      | 1974/5000 [7:51:36<11:59:19, 14.26s/it]Train:  40%|███▉      | 1975/5000 [7:51:50<11:59:04, 14.26s/it]Train:  40%|███▉      | 1976/5000 [7:52:04<11:58:49, 14.26s/it]Train:  40%|███▉      | 1977/5000 [7:52:18<11:59:29, 14.28s/it]Train:  40%|███▉      | 1978/5000 [7:52:33<11:59:00, 14.28s/it]Train:  40%|███▉      | 1979/5000 [7:52:47<11:59:01, 14.28s/it]Train:  40%|███▉      | 1980/5000 [7:53:01<11:58:32, 14.28s/it]                                                               {'loss': 1.72887383, 'token_acc': 0.62395897, 'grad_norm': 0.24637318, 'learning_rate': 1.414e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069726, 'epoch': 1.01, 'global_step/max_steps': '1980/5000', 'percentage': '39.60%', 'elapsed_time': '7h 53m 1s', 'remaining_time': '12h 1m 29s'}
+Train:  40%|███▉      | 1980/5000 [7:53:01<11:58:32, 14.28s/it]Train:  40%|███▉      | 1980/5000 [7:53:01<11:58:32, 14.28s/it]Train:  40%|███▉      | 1981/5000 [7:53:16<11:59:08, 14.29s/it]Train:  40%|███▉      | 1982/5000 [7:53:30<11:58:45, 14.29s/it]Train:  40%|███▉      | 1983/5000 [7:53:44<11:58:52, 14.30s/it]Train:  40%|███▉      | 1984/5000 [7:53:59<11:58:39, 14.30s/it]Train:  40%|███▉      | 1985/5000 [7:54:13<11:58:12, 14.29s/it]Train:  40%|███▉      | 1986/5000 [7:54:27<11:57:54, 14.29s/it]Train:  40%|███▉      | 1987/5000 [7:54:41<11:57:53, 14.30s/it]Train:  40%|███▉      | 1988/5000 [7:54:56<11:57:18, 14.29s/it]Train:  40%|███▉      | 1989/5000 [7:55:10<11:56:57, 14.29s/it]Train:  40%|███▉      | 1990/5000 [7:55:24<11:56:58, 14.29s/it]                                                               {'loss': 1.72297859, 'token_acc': 0.62448989, 'grad_norm': 0.24425344, 'learning_rate': 1.408e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069727, 'epoch': 1.02, 'global_step/max_steps': '1990/5000', 'percentage': '39.80%', 'elapsed_time': '7h 55m 24s', 'remaining_time': '11h 59m 5s'}
+Train:  40%|███▉      | 1990/5000 [7:55:24<11:56:58, 14.29s/it]Train:  40%|███▉      | 1990/5000 [7:55:24<11:56:58, 14.29s/it]Train:  40%|███▉      | 1991/5000 [7:55:39<11:56:54, 14.30s/it]Train:  40%|███▉      | 1992/5000 [7:55:53<11:56:54, 14.30s/it]Train:  40%|███▉      | 1993/5000 [7:56:07<11:55:54, 14.28s/it]Train:  40%|███▉      | 1994/5000 [7:56:21<11:54:57, 14.27s/it]Train:  40%|███▉      | 1995/5000 [7:56:36<11:54:22, 14.26s/it]Train:  40%|███▉      | 1996/5000 [7:56:50<11:54:21, 14.27s/it]Train:  40%|███▉      | 1997/5000 [7:57:04<11:53:26, 14.25s/it]Train:  40%|███▉      | 1998/5000 [7:57:18<11:53:42, 14.26s/it]Train:  40%|███▉      | 1999/5000 [7:57:33<11:53:28, 14.26s/it]Train:  40%|████      | 2000/5000 [7:57:47<11:53:19, 14.27s/it]                                                               {'loss': 1.7153511, 'token_acc': 0.61914074, 'grad_norm': 0.24053566, 'learning_rate': 1.402e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069729, 'epoch': 1.02, 'global_step/max_steps': '2000/5000', 'percentage': '40.00%', 'elapsed_time': '7h 57m 47s', 'remaining_time': '11h 56m 41s'}
+Train:  40%|████      | 2000/5000 [7:57:47<11:53:19, 14.27s/it]Train:  40%|████      | 2000/5000 [7:57:47<11:53:19, 14.27s/it]                                                               {'eval_loss': 1.46200573, 'eval_token_acc': 0.66208071, 'eval_runtime': 42.3496, 'eval_samples_per_second': 0.331, 'eval_steps_per_second': 0.024, 'epoch': 1.02, 'global_step/max_steps': '2000/5000', 'percentage': '40.00%', 'elapsed_time': '7h 58m 29s', 'remaining_time': '11h 57m 44s'}
+Train:  40%|████      | 2000/5000 [7:58:29<11:53:19, 14.27s/it]Train:  40%|████      | 2000/5000 [7:58:29<11:53:19, 14.27s/it][INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/checkpoint-2000
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  40%|████      | 2001/5000 [7:59:31<34:12:24, 41.06s/it]Train:  40%|████      | 2002/5000 [7:59:45<27:30:13, 33.03s/it]Train:  40%|████      | 2003/5000 [7:59:59<22:48:26, 27.40s/it]Train:  40%|████      | 2004/5000 [8:00:13<19:31:36, 23.46s/it]Train:  40%|████      | 2005/5000 [8:00:28<17:14:25, 20.72s/it]Train:  40%|████      | 2006/5000 [8:00:42<15:38:09, 18.80s/it]Train:  40%|████      | 2007/5000 [8:00:56<14:30:27, 17.45s/it]Train:  40%|████      | 2008/5000 [8:01:11<13:43:26, 16.51s/it]Train:  40%|████      | 2009/5000 [8:01:25<13:10:48, 15.86s/it]Train:  40%|████      | 2010/5000 [8:01:39<12:47:19, 15.40s/it]                                                               {'loss': 1.72954025, 'token_acc': 0.62349798, 'grad_norm': 0.24572314, 'learning_rate': 1.396e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069515, 'epoch': 1.02, 'global_step/max_steps': '2010/5000', 'percentage': '40.20%', 'elapsed_time': '8h 1m 39s', 'remaining_time': '11h 56m 30s'}
+Train:  40%|████      | 2010/5000 [8:01:39<12:47:19, 15.40s/it]Train:  40%|████      | 2010/5000 [8:01:39<12:47:19, 15.40s/it]Train:  40%|████      | 2011/5000 [8:01:54<12:30:56, 15.07s/it]Train:  40%|████      | 2012/5000 [8:02:08<12:19:23, 14.85s/it]Train:  40%|████      | 2013/5000 [8:02:22<12:11:29, 14.69s/it]Train:  40%|████      | 2014/5000 [8:02:37<12:05:06, 14.57s/it]Train:  40%|████      | 2015/5000 [8:02:51<12:01:00, 14.49s/it]Train:  40%|████      | 2016/5000 [8:03:05<11:57:52, 14.43s/it]Train:  40%|████      | 2017/5000 [8:03:19<11:56:11, 14.41s/it]Train:  40%|████      | 2018/5000 [8:03:34<11:54:22, 14.37s/it]Train:  40%|████      | 2019/5000 [8:03:48<11:52:59, 14.35s/it]Train:  40%|████      | 2020/5000 [8:04:02<11:52:16, 14.34s/it]                                                               {'loss': 1.70868492, 'token_acc': 0.62152301, 'grad_norm': 0.23453923, 'learning_rate': 1.39e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069517, 'epoch': 1.02, 'global_step/max_steps': '2020/5000', 'percentage': '40.40%', 'elapsed_time': '8h 4m 2s', 'remaining_time': '11h 54m 5s'}
+Train:  40%|████      | 2020/5000 [8:04:02<11:52:16, 14.34s/it]Train:  40%|████      | 2020/5000 [8:04:02<11:52:16, 14.34s/it]Train:  40%|████      | 2021/5000 [8:04:17<11:51:45, 14.34s/it]Train:  40%|████      | 2022/5000 [8:04:31<11:50:59, 14.32s/it]Train:  40%|████      | 2023/5000 [8:04:45<11:50:32, 14.32s/it]Train:  40%|████      | 2024/5000 [8:05:00<11:50:10, 14.32s/it]Train:  40%|████      | 2025/5000 [8:05:14<11:49:27, 14.31s/it]Train:  41%|████      | 2026/5000 [8:05:28<11:48:11, 14.29s/it]Train:  41%|████      | 2027/5000 [8:05:42<11:48:06, 14.29s/it]Train:  41%|████      | 2028/5000 [8:05:57<11:48:19, 14.30s/it]Train:  41%|████      | 2029/5000 [8:06:11<11:47:52, 14.30s/it]Train:  41%|████      | 2030/5000 [8:06:25<11:47:59, 14.30s/it]                                                               {'loss': 1.70642395, 'token_acc': 0.62228335, 'grad_norm': 0.2413934, 'learning_rate': 1.383e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069519, 'epoch': 1.02, 'global_step/max_steps': '2030/5000', 'percentage': '40.60%', 'elapsed_time': '8h 6m 25s', 'remaining_time': '11h 51m 40s'}
+Train:  41%|████      | 2030/5000 [8:06:25<11:47:59, 14.30s/it]Train:  41%|████      | 2030/5000 [8:06:25<11:47:59, 14.30s/it]Train:  41%|████      | 2031/5000 [8:06:40<11:48:11, 14.31s/it]Train:  41%|████      | 2032/5000 [8:06:54<11:48:07, 14.32s/it]Train:  41%|████      | 2033/5000 [8:07:08<11:47:47, 14.31s/it]Train:  41%|████      | 2034/5000 [8:07:23<11:46:39, 14.30s/it]Train:  41%|████      | 2035/5000 [8:07:37<11:46:00, 14.29s/it]Train:  41%|████      | 2036/5000 [8:07:51<11:45:28, 14.28s/it]Train:  41%|████      | 2037/5000 [8:08:05<11:45:33, 14.29s/it]Train:  41%|████      | 2038/5000 [8:08:20<11:44:54, 14.28s/it]Train:  41%|████      | 2039/5000 [8:08:34<11:45:12, 14.29s/it]Train:  41%|████      | 2040/5000 [8:08:48<11:44:54, 14.29s/it]                                                               {'loss': 1.71422882, 'token_acc': 0.62806008, 'grad_norm': 0.24148215, 'learning_rate': 1.377e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069521, 'epoch': 1.03, 'global_step/max_steps': '2040/5000', 'percentage': '40.80%', 'elapsed_time': '8h 8m 48s', 'remaining_time': '11h 49m 15s'}
+Train:  41%|████      | 2040/5000 [8:08:48<11:44:54, 14.29s/it]Train:  41%|████      | 2040/5000 [8:08:48<11:44:54, 14.29s/it]Train:  41%|████      | 2041/5000 [8:09:03<11:44:47, 14.29s/it]Train:  41%|████      | 2042/5000 [8:09:17<11:44:36, 14.29s/it]Train:  41%|████      | 2043/5000 [8:09:31<11:43:34, 14.28s/it]Train:  41%|████      | 2044/5000 [8:09:45<11:43:21, 14.28s/it]Train:  41%|████      | 2045/5000 [8:10:00<11:43:34, 14.29s/it]Train:  41%|████      | 2046/5000 [8:10:14<11:43:11, 14.28s/it]Train:  41%|████      | 2047/5000 [8:10:28<11:42:37, 14.28s/it]Train:  41%|████      | 2048/5000 [8:10:43<11:42:09, 14.27s/it]Train:  41%|████      | 2049/5000 [8:10:57<11:42:32, 14.28s/it]Train:  41%|████      | 2050/5000 [8:11:11<11:42:08, 14.28s/it]                                                               {'loss': 1.7153841, 'token_acc': 0.62661826, 'grad_norm': 0.24797735, 'learning_rate': 1.371e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069523, 'epoch': 1.03, 'global_step/max_steps': '2050/5000', 'percentage': '41.00%', 'elapsed_time': '8h 11m 11s', 'remaining_time': '11h 46m 50s'}
+Train:  41%|████      | 2050/5000 [8:11:11<11:42:08, 14.28s/it]Train:  41%|████      | 2050/5000 [8:11:11<11:42:08, 14.28s/it]Train:  41%|████      | 2051/5000 [8:11:25<11:42:01, 14.28s/it]Train:  41%|████      | 2052/5000 [8:11:40<11:41:52, 14.29s/it]Train:  41%|████      | 2053/5000 [8:11:54<11:41:48, 14.29s/it]Train:  41%|████      | 2054/5000 [8:12:08<11:42:00, 14.30s/it]Train:  41%|████      | 2055/5000 [8:12:23<11:41:48, 14.30s/it]Train:  41%|████      | 2056/5000 [8:12:37<11:41:19, 14.29s/it]Train:  41%|████      | 2057/5000 [8:12:51<11:40:35, 14.28s/it]Train:  41%|████      | 2058/5000 [8:13:05<11:41:00, 14.30s/it]Train:  41%|████      | 2059/5000 [8:13:20<11:40:24, 14.29s/it]Train:  41%|████      | 2060/5000 [8:13:34<11:39:56, 14.28s/it]                                                               {'loss': 1.69724693, 'token_acc': 0.62554972, 'grad_norm': 0.24583441, 'learning_rate': 1.365e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069525, 'epoch': 1.03, 'global_step/max_steps': '2060/5000', 'percentage': '41.20%', 'elapsed_time': '8h 13m 34s', 'remaining_time': '11h 44m 25s'}
+Train:  41%|████      | 2060/5000 [8:13:34<11:39:56, 14.28s/it]Train:  41%|████      | 2060/5000 [8:13:34<11:39:56, 14.28s/it]Train:  41%|████      | 2061/5000 [8:13:48<11:38:52, 14.27s/it]Train:  41%|████      | 2062/5000 [8:14:03<11:38:46, 14.27s/it]Train:  41%|████▏     | 2063/5000 [8:14:17<11:38:39, 14.27s/it]Train:  41%|████▏     | 2064/5000 [8:14:31<11:39:20, 14.29s/it]Train:  41%|████▏     | 2065/5000 [8:14:45<11:38:51, 14.29s/it]Train:  41%|████▏     | 2066/5000 [8:15:00<11:38:50, 14.29s/it]Train:  41%|████▏     | 2067/5000 [8:15:14<11:38:14, 14.28s/it]Train:  41%|████▏     | 2068/5000 [8:15:28<11:38:04, 14.29s/it]Train:  41%|████▏     | 2069/5000 [8:15:43<11:37:25, 14.28s/it]Train:  41%|████▏     | 2070/5000 [8:15:57<11:37:21, 14.28s/it]                                                               {'loss': 1.71803627, 'token_acc': 0.63186792, 'grad_norm': 0.24912457, 'learning_rate': 1.359e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069528, 'epoch': 1.03, 'global_step/max_steps': '2070/5000', 'percentage': '41.40%', 'elapsed_time': '8h 15m 57s', 'remaining_time': '11h 42m 0s'}
+Train:  41%|████▏     | 2070/5000 [8:15:57<11:37:21, 14.28s/it]Train:  41%|████▏     | 2070/5000 [8:15:57<11:37:21, 14.28s/it]Train:  41%|████▏     | 2071/5000 [8:16:11<11:37:18, 14.28s/it]Train:  41%|████▏     | 2072/5000 [8:16:25<11:37:24, 14.29s/it]Train:  41%|████▏     | 2073/5000 [8:16:40<11:36:33, 14.28s/it]Train:  41%|████▏     | 2074/5000 [8:16:54<11:36:24, 14.28s/it]Train:  42%|████▏     | 2075/5000 [8:17:08<11:35:53, 14.27s/it]Train:  42%|████▏     | 2076/5000 [8:17:22<11:35:51, 14.28s/it]Train:  42%|████▏     | 2077/5000 [8:17:37<11:36:25, 14.30s/it]Train:  42%|████▏     | 2078/5000 [8:17:51<11:36:07, 14.29s/it]Train:  42%|████▏     | 2079/5000 [8:18:05<11:35:48, 14.29s/it]Train:  42%|████▏     | 2080/5000 [8:18:20<11:35:47, 14.30s/it]                                                               {'loss': 1.70942116, 'token_acc': 0.62671032, 'grad_norm': 0.242061, 'learning_rate': 1.353e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06953, 'epoch': 1.03, 'global_step/max_steps': '2080/5000', 'percentage': '41.60%', 'elapsed_time': '8h 18m 20s', 'remaining_time': '11h 39m 35s'}
+Train:  42%|████▏     | 2080/5000 [8:18:20<11:35:47, 14.30s/it]Train:  42%|████▏     | 2080/5000 [8:18:20<11:35:47, 14.30s/it]Train:  42%|████▏     | 2081/5000 [8:18:34<11:35:22, 14.29s/it]Train:  42%|████▏     | 2082/5000 [8:18:48<11:35:03, 14.29s/it]Train:  42%|████▏     | 2083/5000 [8:19:03<11:34:54, 14.29s/it]Train:  42%|████▏     | 2084/5000 [8:19:17<11:34:32, 14.29s/it]Train:  42%|████▏     | 2085/5000 [8:19:31<11:34:38, 14.30s/it]Train:  42%|████▏     | 2086/5000 [8:19:45<11:34:23, 14.30s/it]Train:  42%|████▏     | 2087/5000 [8:20:00<11:34:05, 14.30s/it]Train:  42%|████▏     | 2088/5000 [8:20:14<11:32:46, 14.27s/it]Train:  42%|████▏     | 2089/5000 [8:20:28<11:32:22, 14.27s/it]Train:  42%|████▏     | 2090/5000 [8:20:42<11:31:31, 14.26s/it]                                                               {'loss': 1.71031494, 'token_acc': 0.62542684, 'grad_norm': 0.25239292, 'learning_rate': 1.347e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069532, 'epoch': 1.04, 'global_step/max_steps': '2090/5000', 'percentage': '41.80%', 'elapsed_time': '8h 20m 42s', 'remaining_time': '11h 37m 10s'}
+Train:  42%|████▏     | 2090/5000 [8:20:42<11:31:31, 14.26s/it]Train:  42%|████▏     | 2090/5000 [8:20:42<11:31:31, 14.26s/it]Train:  42%|████▏     | 2091/5000 [8:20:57<11:31:13, 14.26s/it]Train:  42%|████▏     | 2092/5000 [8:21:11<11:31:27, 14.27s/it]Train:  42%|████▏     | 2093/5000 [8:21:25<11:31:11, 14.27s/it]Train:  42%|████▏     | 2094/5000 [8:21:40<11:31:14, 14.27s/it]Train:  42%|████▏     | 2095/5000 [8:21:54<11:31:01, 14.27s/it]Train:  42%|████▏     | 2096/5000 [8:22:08<11:30:27, 14.27s/it]Train:  42%|████▏     | 2097/5000 [8:22:22<11:30:00, 14.26s/it]Train:  42%|████▏     | 2098/5000 [8:22:37<11:29:28, 14.26s/it]Train:  42%|████▏     | 2099/5000 [8:22:51<11:29:20, 14.26s/it]Train:  42%|████▏     | 2100/5000 [8:23:05<11:29:07, 14.26s/it]                                                               {'loss': 1.70365829, 'token_acc': 0.62673294, 'grad_norm': 0.23603646, 'learning_rate': 1.34e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069535, 'epoch': 1.04, 'global_step/max_steps': '2100/5000', 'percentage': '42.00%', 'elapsed_time': '8h 23m 5s', 'remaining_time': '11h 34m 44s'}
+Train:  42%|████▏     | 2100/5000 [8:23:05<11:29:07, 14.26s/it]Train:  42%|████▏     | 2100/5000 [8:23:05<11:29:07, 14.26s/it]Train:  42%|████▏     | 2101/5000 [8:23:19<11:29:33, 14.27s/it]Train:  42%|████▏     | 2102/5000 [8:23:34<11:28:51, 14.26s/it]Train:  42%|████▏     | 2103/5000 [8:23:48<11:28:27, 14.26s/it]Train:  42%|████▏     | 2104/5000 [8:24:02<11:29:20, 14.28s/it]Train:  42%|████▏     | 2105/5000 [8:24:17<11:28:49, 14.28s/it]Train:  42%|████▏     | 2106/5000 [8:24:31<11:28:52, 14.28s/it]Train:  42%|████▏     | 2107/5000 [8:24:45<11:28:40, 14.28s/it]Train:  42%|████▏     | 2108/5000 [8:24:59<11:28:14, 14.28s/it]Train:  42%|████▏     | 2109/5000 [8:25:14<11:27:45, 14.27s/it]Train:  42%|████▏     | 2110/5000 [8:25:28<11:27:16, 14.27s/it]                                                               {'loss': 1.70229492, 'token_acc': 0.62278301, 'grad_norm': 0.23169881, 'learning_rate': 1.334e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069538, 'epoch': 1.04, 'global_step/max_steps': '2110/5000', 'percentage': '42.20%', 'elapsed_time': '8h 25m 28s', 'remaining_time': '11h 32m 19s'}
+Train:  42%|████▏     | 2110/5000 [8:25:28<11:27:16, 14.27s/it]Train:  42%|████▏     | 2110/5000 [8:25:28<11:27:16, 14.27s/it]Train:  42%|████▏     | 2111/5000 [8:25:42<11:27:09, 14.27s/it]Train:  42%|████▏     | 2112/5000 [8:25:56<11:27:06, 14.28s/it]Train:  42%|████▏     | 2113/5000 [8:26:11<11:26:38, 14.27s/it]Train:  42%|████▏     | 2114/5000 [8:26:25<11:26:25, 14.27s/it]Train:  42%|████▏     | 2115/5000 [8:26:39<11:26:30, 14.28s/it]Train:  42%|████▏     | 2116/5000 [8:26:54<11:27:22, 14.30s/it]Train:  42%|████▏     | 2117/5000 [8:27:08<11:26:18, 14.28s/it]Train:  42%|████▏     | 2118/5000 [8:27:22<11:25:43, 14.28s/it]Train:  42%|████▏     | 2119/5000 [8:27:36<11:25:04, 14.27s/it]Train:  42%|████▏     | 2120/5000 [8:27:51<11:24:08, 14.25s/it]                                                               {'loss': 1.68975029, 'token_acc': 0.62971853, 'grad_norm': 0.23850967, 'learning_rate': 1.328e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06954, 'epoch': 1.04, 'global_step/max_steps': '2120/5000', 'percentage': '42.40%', 'elapsed_time': '8h 27m 51s', 'remaining_time': '11h 29m 54s'}
+Train:  42%|████▏     | 2120/5000 [8:27:51<11:24:08, 14.25s/it]Train:  42%|████▏     | 2120/5000 [8:27:51<11:24:08, 14.25s/it]Train:  42%|████▏     | 2121/5000 [8:28:05<11:24:03, 14.26s/it]Train:  42%|████▏     | 2122/5000 [8:28:19<11:24:01, 14.26s/it]Train:  42%|████▏     | 2123/5000 [8:28:33<11:23:48, 14.26s/it]Train:  42%|████▏     | 2124/5000 [8:28:48<11:23:19, 14.26s/it]Train:  42%|████▎     | 2125/5000 [8:29:02<11:22:54, 14.25s/it]Train:  43%|████▎     | 2126/5000 [8:29:16<11:22:33, 14.25s/it]Train:  43%|████▎     | 2127/5000 [8:29:30<11:22:52, 14.26s/it]Train:  43%|████▎     | 2128/5000 [8:29:45<11:22:53, 14.27s/it]Train:  43%|████▎     | 2129/5000 [8:29:59<11:23:14, 14.28s/it]Train:  43%|████▎     | 2130/5000 [8:30:13<11:22:42, 14.27s/it]                                                               {'loss': 1.69765892, 'token_acc': 0.62502184, 'grad_norm': 0.24132949, 'learning_rate': 1.322e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069543, 'epoch': 1.04, 'global_step/max_steps': '2130/5000', 'percentage': '42.60%', 'elapsed_time': '8h 30m 13s', 'remaining_time': '11h 27m 29s'}
+Train:  43%|████▎     | 2130/5000 [8:30:13<11:22:42, 14.27s/it]Train:  43%|████▎     | 2130/5000 [8:30:13<11:22:42, 14.27s/it]Train:  43%|████▎     | 2131/5000 [8:30:27<11:22:04, 14.26s/it]Train:  43%|████▎     | 2132/5000 [8:30:42<11:21:58, 14.27s/it]Train:  43%|████▎     | 2133/5000 [8:30:56<11:21:21, 14.26s/it]Train:  43%|████▎     | 2134/5000 [8:31:10<11:21:38, 14.27s/it]Train:  43%|████▎     | 2135/5000 [8:31:25<11:22:02, 14.28s/it]Train:  43%|████▎     | 2136/5000 [8:31:39<11:21:45, 14.28s/it]Train:  43%|████▎     | 2137/5000 [8:31:53<11:21:05, 14.27s/it]Train:  43%|████▎     | 2138/5000 [8:32:07<11:20:54, 14.27s/it]Train:  43%|████▎     | 2139/5000 [8:32:22<11:20:19, 14.27s/it]Train:  43%|████▎     | 2140/5000 [8:32:36<11:20:35, 14.28s/it]                                                               {'loss': 1.69033146, 'token_acc': 0.63151666, 'grad_norm': 0.23956716, 'learning_rate': 1.315e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069545, 'epoch': 1.05, 'global_step/max_steps': '2140/5000', 'percentage': '42.80%', 'elapsed_time': '8h 32m 36s', 'remaining_time': '11h 25m 4s'}
+Train:  43%|████▎     | 2140/5000 [8:32:36<11:20:35, 14.28s/it]Train:  43%|████▎     | 2140/5000 [8:32:36<11:20:35, 14.28s/it]Train:  43%|████▎     | 2141/5000 [8:32:50<11:20:00, 14.27s/it]Train:  43%|████▎     | 2142/5000 [8:33:05<11:20:33, 14.29s/it]Train:  43%|████▎     | 2143/5000 [8:33:19<11:19:58, 14.28s/it]Train:  43%|████▎     | 2144/5000 [8:33:33<11:19:21, 14.27s/it]Train:  43%|████▎     | 2145/5000 [8:33:47<11:19:39, 14.28s/it]Train:  43%|████▎     | 2146/5000 [8:34:02<11:19:35, 14.29s/it]Train:  43%|████▎     | 2147/5000 [8:34:16<11:19:10, 14.28s/it]Train:  43%|████▎     | 2148/5000 [8:34:30<11:18:56, 14.28s/it]Train:  43%|████▎     | 2149/5000 [8:34:44<11:18:06, 14.27s/it]Train:  43%|████▎     | 2150/5000 [8:34:59<11:17:52, 14.27s/it]                                                               {'loss': 1.68687859, 'token_acc': 0.62668555, 'grad_norm': 0.23661539, 'learning_rate': 1.309e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069547, 'epoch': 1.05, 'global_step/max_steps': '2150/5000', 'percentage': '43.00%', 'elapsed_time': '8h 34m 59s', 'remaining_time': '11h 22m 39s'}
+Train:  43%|████▎     | 2150/5000 [8:34:59<11:17:52, 14.27s/it]Train:  43%|████▎     | 2150/5000 [8:34:59<11:17:52, 14.27s/it]Train:  43%|████▎     | 2151/5000 [8:35:13<11:17:58, 14.28s/it]Train:  43%|████▎     | 2152/5000 [8:35:27<11:17:32, 14.27s/it]Train:  43%|████▎     | 2153/5000 [8:35:42<11:17:25, 14.28s/it]Train:  43%|████▎     | 2154/5000 [8:35:56<11:17:36, 14.29s/it]Train:  43%|████▎     | 2155/5000 [8:36:10<11:17:34, 14.29s/it]Train:  43%|████▎     | 2156/5000 [8:36:24<11:17:04, 14.28s/it]Train:  43%|████▎     | 2157/5000 [8:36:39<11:16:23, 14.28s/it]Train:  43%|████▎     | 2158/5000 [8:36:53<11:15:51, 14.27s/it]Train:  43%|████▎     | 2159/5000 [8:37:07<11:15:23, 14.26s/it]Train:  43%|████▎     | 2160/5000 [8:37:22<11:15:22, 14.27s/it]                                                               {'loss': 1.69290104, 'token_acc': 0.62526304, 'grad_norm': 0.24287884, 'learning_rate': 1.303e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06955, 'epoch': 1.05, 'global_step/max_steps': '2160/5000', 'percentage': '43.20%', 'elapsed_time': '8h 37m 22s', 'remaining_time': '11h 20m 14s'}
+Train:  43%|████▎     | 2160/5000 [8:37:22<11:15:22, 14.27s/it]Train:  43%|████▎     | 2160/5000 [8:37:22<11:15:22, 14.27s/it]Train:  43%|████▎     | 2161/5000 [8:37:36<11:15:14, 14.27s/it]Train:  43%|████▎     | 2162/5000 [8:37:50<11:15:03, 14.27s/it]Train:  43%|████▎     | 2163/5000 [8:38:04<11:14:54, 14.27s/it]Train:  43%|████▎     | 2164/5000 [8:38:19<11:15:02, 14.28s/it]Train:  43%|████▎     | 2165/5000 [8:38:33<11:14:37, 14.28s/it]Train:  43%|████▎     | 2166/5000 [8:38:47<11:14:27, 14.28s/it]Train:  43%|████▎     | 2167/5000 [8:39:01<11:14:18, 14.28s/it]Train:  43%|████▎     | 2168/5000 [8:39:16<11:13:33, 14.27s/it]Train:  43%|████▎     | 2169/5000 [8:39:30<11:13:17, 14.27s/it]Train:  43%|████▎     | 2170/5000 [8:39:44<11:13:07, 14.27s/it]                                                               {'loss': 1.68420792, 'token_acc': 0.62759301, 'grad_norm': 0.24019074, 'learning_rate': 1.296e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069552, 'epoch': 1.05, 'global_step/max_steps': '2170/5000', 'percentage': '43.40%', 'elapsed_time': '8h 39m 44s', 'remaining_time': '11h 17m 49s'}
+Train:  43%|████▎     | 2170/5000 [8:39:44<11:13:07, 14.27s/it]Train:  43%|████▎     | 2170/5000 [8:39:44<11:13:07, 14.27s/it]Train:  43%|████▎     | 2171/5000 [8:39:59<11:13:04, 14.28s/it]Train:  43%|████▎     | 2172/5000 [8:40:13<11:12:57, 14.28s/it]Train:  43%|████▎     | 2173/5000 [8:40:27<11:12:48, 14.28s/it]Train:  43%|████▎     | 2174/5000 [8:40:41<11:12:25, 14.28s/it]Train:  44%|████▎     | 2175/5000 [8:40:56<11:12:37, 14.29s/it]Train:  44%|████▎     | 2176/5000 [8:41:10<11:13:16, 14.30s/it]Train:  44%|████▎     | 2177/5000 [8:41:24<11:13:16, 14.31s/it]Train:  44%|████▎     | 2178/5000 [8:41:39<11:12:31, 14.30s/it]Train:  44%|████▎     | 2179/5000 [8:41:53<11:11:27, 14.28s/it]Train:  44%|████▎     | 2180/5000 [8:42:07<11:11:29, 14.29s/it]                                                               {'loss': 1.69465981, 'token_acc': 0.62660536, 'grad_norm': 0.2510809, 'learning_rate': 1.29e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069554, 'epoch': 1.05, 'global_step/max_steps': '2180/5000', 'percentage': '43.60%', 'elapsed_time': '8h 42m 7s', 'remaining_time': '11h 15m 24s'}
+Train:  44%|████▎     | 2180/5000 [8:42:07<11:11:29, 14.29s/it]Train:  44%|████▎     | 2180/5000 [8:42:07<11:11:29, 14.29s/it]Train:  44%|████▎     | 2181/5000 [8:42:21<11:10:55, 14.28s/it]Train:  44%|████▎     | 2182/5000 [8:42:36<11:10:36, 14.28s/it]Train:  44%|████▎     | 2183/5000 [8:42:50<11:10:22, 14.28s/it]Train:  44%|████▎     | 2184/5000 [8:43:04<11:09:45, 14.27s/it]Train:  44%|████▎     | 2185/5000 [8:43:19<11:10:23, 14.29s/it]Train:  44%|████▎     | 2186/5000 [8:43:33<11:10:21, 14.29s/it]Train:  44%|████▎     | 2187/5000 [8:43:47<11:10:21, 14.30s/it]Train:  44%|████▍     | 2188/5000 [8:44:01<11:09:53, 14.29s/it]Train:  44%|████▍     | 2189/5000 [8:44:16<11:09:37, 14.29s/it]Train:  44%|████▍     | 2190/5000 [8:44:30<11:09:04, 14.29s/it]                                                               {'loss': 1.69417, 'token_acc': 0.63225802, 'grad_norm': 0.24738197, 'learning_rate': 1.284e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069556, 'epoch': 1.06, 'global_step/max_steps': '2190/5000', 'percentage': '43.80%', 'elapsed_time': '8h 44m 30s', 'remaining_time': '11h 13m 0s'}
+Train:  44%|████▍     | 2190/5000 [8:44:30<11:09:04, 14.29s/it]Train:  44%|████▍     | 2190/5000 [8:44:30<11:09:04, 14.29s/it]Train:  44%|████▍     | 2191/5000 [8:44:44<11:08:41, 14.28s/it]Train:  44%|████▍     | 2192/5000 [8:44:59<11:08:35, 14.29s/it]Train:  44%|████▍     | 2193/5000 [8:45:13<11:08:17, 14.28s/it]Train:  44%|████▍     | 2194/5000 [8:45:27<11:07:50, 14.28s/it]Train:  44%|████▍     | 2195/5000 [8:45:41<11:07:16, 14.27s/it]Train:  44%|████▍     | 2196/5000 [8:45:56<11:07:15, 14.28s/it]Train:  44%|████▍     | 2197/5000 [8:46:10<11:06:37, 14.27s/it]Train:  44%|████▍     | 2198/5000 [8:46:24<11:06:50, 14.28s/it]Train:  44%|████▍     | 2199/5000 [8:46:39<11:06:40, 14.28s/it]Train:  44%|████▍     | 2200/5000 [8:46:53<11:06:37, 14.28s/it]                                                               {'loss': 1.697995, 'token_acc': 0.63445025, 'grad_norm': 0.24279156, 'learning_rate': 1.277e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069558, 'epoch': 1.06, 'global_step/max_steps': '2200/5000', 'percentage': '44.00%', 'elapsed_time': '8h 46m 53s', 'remaining_time': '11h 10m 35s'}
+Train:  44%|████▍     | 2200/5000 [8:46:53<11:06:37, 14.28s/it]Train:  44%|████▍     | 2200/5000 [8:46:53<11:06:37, 14.28s/it]Train:  44%|████▍     | 2201/5000 [8:47:07<11:06:45, 14.29s/it]Train:  44%|████▍     | 2202/5000 [8:47:21<11:06:47, 14.30s/it]Train:  44%|████▍     | 2203/5000 [8:47:36<11:06:14, 14.29s/it]Train:  44%|████▍     | 2204/5000 [8:47:50<11:06:21, 14.30s/it]Train:  44%|████▍     | 2205/5000 [8:48:04<11:06:08, 14.30s/it]Train:  44%|████▍     | 2206/5000 [8:48:19<11:05:33, 14.29s/it]Train:  44%|████▍     | 2207/5000 [8:48:33<11:05:10, 14.29s/it]Train:  44%|████▍     | 2208/5000 [8:48:47<11:04:58, 14.29s/it]Train:  44%|████▍     | 2209/5000 [8:49:01<11:04:28, 14.28s/it]Train:  44%|████▍     | 2210/5000 [8:49:16<11:04:27, 14.29s/it]                                                               {'loss': 1.6755167, 'token_acc': 0.63277096, 'grad_norm': 0.24841964, 'learning_rate': 1.271e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06956, 'epoch': 1.06, 'global_step/max_steps': '2210/5000', 'percentage': '44.20%', 'elapsed_time': '8h 49m 16s', 'remaining_time': '11h 8m 10s'}
+Train:  44%|████▍     | 2210/5000 [8:49:16<11:04:27, 14.29s/it]Train:  44%|████▍     | 2210/5000 [8:49:16<11:04:27, 14.29s/it]Train:  44%|████▍     | 2211/5000 [8:49:30<11:04:00, 14.28s/it]Train:  44%|████▍     | 2212/5000 [8:49:44<11:04:08, 14.29s/it]Train:  44%|████▍     | 2213/5000 [8:49:59<11:03:51, 14.29s/it]Train:  44%|████▍     | 2214/5000 [8:50:13<11:04:12, 14.30s/it]Train:  44%|████▍     | 2215/5000 [8:50:27<11:03:32, 14.30s/it]Train:  44%|████▍     | 2216/5000 [8:50:42<11:03:39, 14.30s/it]Train:  44%|████▍     | 2217/5000 [8:50:56<11:03:20, 14.30s/it]Train:  44%|████▍     | 2218/5000 [8:51:10<11:02:37, 14.29s/it]Train:  44%|████▍     | 2219/5000 [8:51:24<11:02:34, 14.30s/it]Train:  44%|████▍     | 2220/5000 [8:51:39<11:01:48, 14.28s/it]                                                               {'loss': 1.68069725, 'token_acc': 0.63143062, 'grad_norm': 0.2379085, 'learning_rate': 1.265e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069562, 'epoch': 1.06, 'global_step/max_steps': '2220/5000', 'percentage': '44.40%', 'elapsed_time': '8h 51m 39s', 'remaining_time': '11h 5m 45s'}
+Train:  44%|████▍     | 2220/5000 [8:51:39<11:01:48, 14.28s/it]Train:  44%|████▍     | 2220/5000 [8:51:39<11:01:48, 14.28s/it]Train:  44%|████▍     | 2221/5000 [8:51:53<11:00:51, 14.27s/it]Train:  44%|████▍     | 2222/5000 [8:52:07<10:59:55, 14.25s/it]Train:  44%|████▍     | 2223/5000 [8:52:21<11:00:21, 14.27s/it]Train:  44%|████▍     | 2224/5000 [8:52:36<10:59:50, 14.26s/it]Train:  44%|████▍     | 2225/5000 [8:52:50<11:00:10, 14.27s/it]Train:  45%|████▍     | 2226/5000 [8:53:04<10:59:47, 14.27s/it]Train:  45%|████▍     | 2227/5000 [8:53:19<10:59:36, 14.27s/it]Train:  45%|████▍     | 2228/5000 [8:53:33<10:59:56, 14.28s/it]Train:  45%|████▍     | 2229/5000 [8:53:47<10:59:45, 14.29s/it]Train:  45%|████▍     | 2230/5000 [8:54:01<10:59:15, 14.28s/it]                                                               {'loss': 1.6946146, 'token_acc': 0.63288572, 'grad_norm': 0.23251252, 'learning_rate': 1.258e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069564, 'epoch': 1.06, 'global_step/max_steps': '2230/5000', 'percentage': '44.60%', 'elapsed_time': '8h 54m 1s', 'remaining_time': '11h 3m 20s'}
+Train:  45%|████▍     | 2230/5000 [8:54:01<10:59:15, 14.28s/it]Train:  45%|████▍     | 2230/5000 [8:54:01<10:59:15, 14.28s/it]Train:  45%|████▍     | 2231/5000 [8:54:16<10:59:55, 14.30s/it]Train:  45%|████▍     | 2232/5000 [8:54:30<10:59:26, 14.29s/it]Train:  45%|████▍     | 2233/5000 [8:54:44<10:58:45, 14.28s/it]Train:  45%|████▍     | 2234/5000 [8:54:59<10:58:58, 14.29s/it]Train:  45%|████▍     | 2235/5000 [8:55:13<10:58:34, 14.29s/it]Train:  45%|████▍     | 2236/5000 [8:55:27<10:58:12, 14.29s/it]Train:  45%|████▍     | 2237/5000 [8:55:41<10:58:05, 14.29s/it]Train:  45%|████▍     | 2238/5000 [8:55:56<10:57:38, 14.29s/it]Train:  45%|████▍     | 2239/5000 [8:56:10<10:57:42, 14.29s/it]Train:  45%|████▍     | 2240/5000 [8:56:24<10:56:48, 14.28s/it]                                                               {'loss': 1.68621635, 'token_acc': 0.63107818, 'grad_norm': 0.24746247, 'learning_rate': 1.252e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069566, 'epoch': 1.07, 'global_step/max_steps': '2240/5000', 'percentage': '44.80%', 'elapsed_time': '8h 56m 24s', 'remaining_time': '11h 0m 56s'}
+Train:  45%|████▍     | 2240/5000 [8:56:24<10:56:48, 14.28s/it]Train:  45%|████▍     | 2240/5000 [8:56:24<10:56:48, 14.28s/it]Train:  45%|████▍     | 2241/5000 [8:56:39<10:56:56, 14.29s/it]Train:  45%|████▍     | 2242/5000 [8:56:53<10:56:45, 14.29s/it]Train:  45%|████▍     | 2243/5000 [8:57:07<10:56:43, 14.29s/it]Train:  45%|████▍     | 2244/5000 [8:57:21<10:56:15, 14.29s/it]Train:  45%|████▍     | 2245/5000 [8:57:36<10:55:37, 14.28s/it]Train:  45%|████▍     | 2246/5000 [8:57:50<10:55:15, 14.28s/it]Train:  45%|████▍     | 2247/5000 [8:58:04<10:55:29, 14.29s/it]Train:  45%|████▍     | 2248/5000 [8:58:19<10:55:34, 14.29s/it]Train:  45%|████▍     | 2249/5000 [8:58:33<10:54:58, 14.29s/it]Train:  45%|████▌     | 2250/5000 [8:58:47<10:55:01, 14.29s/it]                                                               {'loss': 1.69330101, 'token_acc': 0.6366411, 'grad_norm': 0.23901613, 'learning_rate': 1.245e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069568, 'epoch': 1.07, 'global_step/max_steps': '2250/5000', 'percentage': '45.00%', 'elapsed_time': '8h 58m 47s', 'remaining_time': '10h 58m 31s'}
+Train:  45%|████▌     | 2250/5000 [8:58:47<10:55:01, 14.29s/it]Train:  45%|████▌     | 2250/5000 [8:58:47<10:55:01, 14.29s/it]Train:  45%|███��▌     | 2251/5000 [8:59:01<10:54:20, 14.28s/it]Train:  45%|████▌     | 2252/5000 [8:59:16<10:55:05, 14.30s/it]Train:  45%|████▌     | 2253/5000 [8:59:30<10:53:55, 14.28s/it]Train:  45%|████▌     | 2254/5000 [8:59:44<10:54:08, 14.29s/it]Train:  45%|████▌     | 2255/5000 [8:59:59<10:54:27, 14.31s/it]Train:  45%|████▌     | 2256/5000 [9:00:13<10:54:08, 14.30s/it]Train:  45%|████▌     | 2257/5000 [9:00:27<10:53:37, 14.30s/it]Train:  45%|████▌     | 2258/5000 [9:00:42<10:52:31, 14.28s/it]Train:  45%|████▌     | 2259/5000 [9:00:56<10:51:44, 14.27s/it]Train:  45%|████▌     | 2260/5000 [9:01:10<10:51:26, 14.27s/it]                                                               {'loss': 1.69679031, 'token_acc': 0.62638123, 'grad_norm': 0.24073748, 'learning_rate': 1.239e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.06957, 'epoch': 1.07, 'global_step/max_steps': '2260/5000', 'percentage': '45.20%', 'elapsed_time': '9h 1m 10s', 'remaining_time': '10h 56m 6s'}
+Train:  45%|████▌     | 2260/5000 [9:01:10<10:51:26, 14.27s/it]Train:  45%|████▌     | 2260/5000 [9:01:10<10:51:26, 14.27s/it]Train:  45%|████▌     | 2261/5000 [9:01:24<10:51:29, 14.27s/it]Train:  45%|████▌     | 2262/5000 [9:01:39<10:51:08, 14.27s/it]Train:  45%|████▌     | 2263/5000 [9:01:53<10:51:39, 14.29s/it]Train:  45%|████▌     | 2264/5000 [9:02:07<10:50:41, 14.27s/it]Train:  45%|████▌     | 2265/5000 [9:02:21<10:50:14, 14.26s/it]Train:  45%|████▌     | 2266/5000 [9:02:36<10:50:17, 14.27s/it]Train:  45%|████▌     | 2267/5000 [9:02:50<10:50:09, 14.27s/it]Train:  45%|████▌     | 2268/5000 [9:03:04<10:49:53, 14.27s/it]Train:  45%|████▌     | 2269/5000 [9:03:18<10:49:35, 14.27s/it]Train:  45%|████▌     | 2270/5000 [9:03:33<10:49:38, 14.28s/it]                                                               {'loss': 1.68764248, 'token_acc': 0.62901272, 'grad_norm': 0.24956495, 'learning_rate': 1.233e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069572, 'epoch': 1.07, 'global_step/max_steps': '2270/5000', 'percentage': '45.40%', 'elapsed_time': '9h 3m 33s', 'remaining_time': '10h 53m 42s'}
+Train:  45%|████▌     | 2270/5000 [9:03:33<10:49:38, 14.28s/it]Train:  45%|████▌     | 2270/5000 [9:03:33<10:49:38, 14.28s/it]Train:  45%|████▌     | 2271/5000 [9:03:47<10:49:08, 14.27s/it]Train:  45%|████▌     | 2272/5000 [9:04:01<10:48:42, 14.27s/it]Train:  45%|████▌     | 2273/5000 [9:04:16<10:47:48, 14.25s/it]Train:  45%|████▌     | 2274/5000 [9:04:30<10:47:35, 14.25s/it]Train:  46%|████▌     | 2275/5000 [9:04:44<10:47:38, 14.26s/it]Train:  46%|████▌     | 2276/5000 [9:04:58<10:47:46, 14.27s/it]Train:  46%|████▌     | 2277/5000 [9:05:13<10:47:00, 14.26s/it]Train:  46%|████▌     | 2278/5000 [9:05:27<10:46:50, 14.26s/it]Train:  46%|████▌     | 2279/5000 [9:05:41<10:45:56, 14.24s/it]Train:  46%|████▌     | 2280/5000 [9:05:55<10:45:57, 14.25s/it]                                                               {'loss': 1.69011078, 'token_acc': 0.62887436, 'grad_norm': 0.24122818, 'learning_rate': 1.226e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069574, 'epoch': 1.07, 'global_step/max_steps': '2280/5000', 'percentage': '45.60%', 'elapsed_time': '9h 5m 55s', 'remaining_time': '10h 51m 17s'}
+Train:  46%|████▌     | 2280/5000 [9:05:55<10:45:57, 14.25s/it]Train:  46%|████▌     | 2280/5000 [9:05:55<10:45:57, 14.25s/it]Train:  46%|████▌     | 2281/5000 [9:06:10<10:45:26, 14.24s/it]Train:  46%|████▌     | 2282/5000 [9:06:24<10:44:55, 14.24s/it]Train:  46%|████▌     | 2283/5000 [9:06:38<10:44:51, 14.24s/it]Train:  46%|████▌     | 2284/5000 [9:06:52<10:44:35, 14.24s/it]Train:  46%|████▌     | 2285/5000 [9:07:06<10:44:08, 14.24s/it]Train:  46%|████▌     | 2286/5000 [9:07:21<10:43:30, 14.23s/it]Train:  46%|████▌     | 2287/5000 [9:07:35<10:43:10, 14.22s/it]Train:  46%|████▌     | 2288/5000 [9:07:49<10:42:56, 14.22s/it]Train:  46%|████▌     | 2289/5000 [9:08:03<10:42:58, 14.23s/it]Train:  46%|████▌     | 2290/5000 [9:08:18<10:43:03, 14.24s/it]                                                               {'loss': 1.69083443, 'token_acc': 0.62787656, 'grad_norm': 0.24037641, 'learning_rate': 1.22e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069577, 'epoch': 1.08, 'global_step/max_steps': '2290/5000', 'percentage': '45.80%', 'elapsed_time': '9h 8m 18s', 'remaining_time': '10h 48m 51s'}
+Train:  46%|████▌     | 2290/5000 [9:08:18<10:43:03, 14.24s/it]Train:  46%|████▌     | 2290/5000 [9:08:18<10:43:03, 14.24s/it]Train:  46%|████▌     | 2291/5000 [9:08:32<10:43:09, 14.24s/it]Train:  46%|████▌     | 2292/5000 [9:08:46<10:43:15, 14.25s/it]Train:  46%|████▌     | 2293/5000 [9:09:00<10:42:38, 14.24s/it]Train:  46%|████▌     | 2294/5000 [9:09:15<10:42:58, 14.26s/it]Train:  46%|████▌     | 2295/5000 [9:09:29<10:43:07, 14.27s/it]Train:  46%|████▌     | 2296/5000 [9:09:43<10:43:39, 14.28s/it]Train:  46%|████▌     | 2297/5000 [9:09:58<10:43:26, 14.28s/it]Train:  46%|████▌     | 2298/5000 [9:10:12<10:43:17, 14.28s/it]Train:  46%|████▌     | 2299/5000 [9:10:26<10:42:24, 14.27s/it]Train:  46%|████▌     | 2300/5000 [9:10:40<10:41:47, 14.26s/it]                                                               {'loss': 1.69811745, 'token_acc': 0.62523238, 'grad_norm': 0.24015376, 'learning_rate': 1.213e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069579, 'epoch': 1.08, 'global_step/max_steps': '2300/5000', 'percentage': '46.00%', 'elapsed_time': '9h 10m 40s', 'remaining_time': '10h 46m 27s'}
+Train:  46%|████▌     | 2300/5000 [9:10:40<10:41:47, 14.26s/it]Train:  46%|████▌     | 2300/5000 [9:10:40<10:41:47, 14.26s/it]Train:  46%|████▌     | 2301/5000 [9:10:55<10:41:19, 14.26s/it]Train:  46%|████▌     | 2302/5000 [9:11:09<10:41:52, 14.27s/it]Train:  46%|████▌     | 2303/5000 [9:11:23<10:42:07, 14.29s/it]Train:  46%|████▌     | 2304/5000 [9:11:37<10:41:13, 14.27s/it]Train:  46%|████▌     | 2305/5000 [9:11:52<10:41:37, 14.28s/it]Train:  46%|████▌     | 2306/5000 [9:12:06<10:41:14, 14.28s/it]Train:  46%|████▌     | 2307/5000 [9:12:20<10:40:27, 14.27s/it]Train:  46%|████▌     | 2308/5000 [9:12:34<10:39:42, 14.26s/it]Train:  46%|████▌     | 2309/5000 [9:12:49<10:39:10, 14.25s/it]Train:  46%|████▌     | 2310/5000 [9:13:03<10:39:22, 14.26s/it]                                                               {'loss': 1.68466415, 'token_acc': 0.63200847, 'grad_norm': 0.23854281, 'learning_rate': 1.207e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069582, 'epoch': 1.08, 'global_step/max_steps': '2310/5000', 'percentage': '46.20%', 'elapsed_time': '9h 13m 3s', 'remaining_time': '10h 44m 2s'}
+Train:  46%|████▌     | 2310/5000 [9:13:03<10:39:22, 14.26s/it]Train:  46%|████▌     | 2310/5000 [9:13:03<10:39:22, 14.26s/it]Train:  46%|████▌     | 2311/5000 [9:13:17<10:39:41, 14.27s/it]Train:  46%|████▌     | 2312/5000 [9:13:32<10:39:10, 14.27s/it]Train:  46%|████▋     | 2313/5000 [9:13:46<10:39:13, 14.27s/it]Train:  46%|████▋     | 2314/5000 [9:14:00<10:38:21, 14.26s/it]Train:  46%|████▋     | 2315/5000 [9:14:14<10:38:35, 14.27s/it]Train:  46%|████▋     | 2316/5000 [9:14:29<10:39:01, 14.29s/it]Train:  46%|████▋     | 2317/5000 [9:14:43<10:38:18, 14.27s/it]Train:  46%|████▋     | 2318/5000 [9:14:57<10:38:21, 14.28s/it]Train:  46%|████▋     | 2319/5000 [9:15:12<10:38:16, 14.28s/it]Train:  46%|████▋     | 2320/5000 [9:15:26<10:37:59, 14.28s/it]                                                               {'loss': 1.66379871, 'token_acc': 0.63734302, 'grad_norm': 0.23737107, 'learning_rate': 1.2e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069584, 'epoch': 1.08, 'global_step/max_steps': '2320/5000', 'percentage': '46.40%', 'elapsed_time': '9h 15m 26s', 'remaining_time': '10h 41m 37s'}
+Train:  46%|████▋     | 2320/5000 [9:15:26<10:37:59, 14.28s/it]Train:  46%|████▋     | 2320/5000 [9:15:26<10:37:59, 14.28s/it]Train:  46%|████▋     | 2321/5000 [9:15:40<10:37:50, 14.29s/it]Train:  46%|████▋     | 2322/5000 [9:15:54<10:37:25, 14.28s/it]Train:  46%|████▋     | 2323/5000 [9:16:09<10:37:03, 14.28s/it]Train:  46%|████▋     | 2324/5000 [9:16:23<10:37:06, 14.28s/it]Train:  46%|████▋     | 2325/5000 [9:16:37<10:37:02, 14.29s/it]Train:  47%|████▋     | 2326/5000 [9:16:52<10:36:57, 14.29s/it]Train:  47%|████▋     | 2327/5000 [9:17:06<10:35:56, 14.27s/it]Train:  47%|████▋     | 2328/5000 [9:17:20<10:36:07, 14.28s/it]Train:  47%|████▋     | 2329/5000 [9:17:34<10:35:41, 14.28s/it]Train:  47%|████▋     | 2330/5000 [9:17:49<10:35:34, 14.28s/it]                                                               {'loss': 1.6757267, 'token_acc': 0.63337772, 'grad_norm': 0.24158044, 'learning_rate': 1.194e-05, 'memory(GiB)': 129.54, 'train_speed(iter/s)': 0.069585, 'epoch': 1.08, 'global_step/max_steps': '2330/5000', 'percentage': '46.60%', 'elapsed_time': '9h 17m 49s', 'remaining_time': '10h 39m 13s'}
+Train:  47%|████▋     | 2330/5000 [9:17:49<10:35:34, 14.28s/it]Train:  47%|████▋     | 2330/5000 [9:17:49<10:35:34, 14.28s/it]Train:  47%|████▋     | 2331/5000 [9:18:03<10:34:59, 14.27s/it]Train:  47%|████▋     | 2332/5000 [9:18:17<10:34:31, 14.27s/it]Train:  47%|████▋     | 2333/5000 [9:18:31<10:34:22, 14.27s/it]Train:  47%|████▋     | 2334/5000 [9:18:46<10:33:51, 14.27s/it]Train:  47%|████▋     | 2335/5000 [9:19:00<10:33:40, 14.27s/it]Train:  47%|████▋     | 2336/5000 [9:19:14<10:33:32, 14.27s/it]Train:  47%|████▋     | 2337/5000 [9:19:29<10:33:43, 14.28s/it]Train:  47%|████▋     | 2338/5000 [9:19:43<10:33:51, 14.29s/it]Train:  47%|████▋     | 2339/5000 [9:19:57<10:33:57, 14.29s/it]Train:  47%|████▋     | 2340/5000 [9:20:11<10:33:51, 14.30s/it]                                                               {'loss': 1.69442959, 'token_acc': 0.63006123, 'grad_norm': 0.23659058, 'learning_rate': 1.187e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069587, 'epoch': 1.09, 'global_step/max_steps': '2340/5000', 'percentage': '46.80%', 'elapsed_time': '9h 20m 11s', 'remaining_time': '10h 36m 48s'}
+Train:  47%|████▋     | 2340/5000 [9:20:11<10:33:51, 14.30s/it]Train:  47%|████▋     | 2340/5000 [9:20:11<10:33:51, 14.30s/it]Train:  47%|████▋     | 2341/5000 [9:20:26<10:33:29, 14.29s/it]Train:  47%|████▋     | 2342/5000 [9:20:40<10:33:18, 14.30s/it]Train:  47%|████▋     | 2343/5000 [9:20:54<10:33:35, 14.31s/it]Train:  47%|████▋     | 2344/5000 [9:21:09<10:32:50, 14.30s/it]Train:  47%|████▋     | 2345/5000 [9:21:23<10:32:21, 14.29s/it]Train:  47%|████▋     | 2346/5000 [9:21:37<10:32:23, 14.30s/it]Train:  47%|████▋     | 2347/5000 [9:21:51<10:31:53, 14.29s/it]Train:  47%|████▋     | 2348/5000 [9:22:06<10:31:35, 14.29s/it]Train:  47%|████▋     | 2349/5000 [9:22:20<10:31:29, 14.29s/it]Train:  47%|████▋     | 2350/5000 [9:22:34<10:31:33, 14.30s/it]                                                               {'loss': 1.67870274, 'token_acc': 0.63880164, 'grad_norm': 0.24583176, 'learning_rate': 1.181e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069589, 'epoch': 1.09, 'global_step/max_steps': '2350/5000', 'percentage': '47.00%', 'elapsed_time': '9h 22m 34s', 'remaining_time': '10h 34m 24s'}
+Train:  47%|████▋     | 2350/5000 [9:22:34<10:31:33, 14.30s/it]Train:  47%|████▋     | 2350/5000 [9:22:34<10:31:33, 14.30s/it]Train:  47%|████▋     | 2351/5000 [9:22:49<10:31:01, 14.29s/it]Train:  47%|████▋     | 2352/5000 [9:23:03<10:30:27, 14.29s/it]Train:  47%|████▋     | 2353/5000 [9:23:17<10:29:43, 14.27s/it]Train:  47%|████▋     | 2354/5000 [9:23:31<10:29:52, 14.28s/it]Train:  47%|████▋     | 2355/5000 [9:23:46<10:29:48, 14.29s/it]Train:  47%|████▋     | 2356/5000 [9:24:00<10:29:41, 14.29s/it]Train:  47%|████▋     | 2357/5000 [9:24:14<10:30:02, 14.30s/it]Train:  47%|████▋     | 2358/5000 [9:24:29<10:29:28, 14.30s/it]Train:  47%|████▋     | 2359/5000 [9:24:43<10:29:23, 14.30s/it]Train:  47%|████▋     | 2360/5000 [9:24:57<10:28:53, 14.29s/it]                                                               {'loss': 1.69151249, 'token_acc': 0.63493667, 'grad_norm': 0.25312823, 'learning_rate': 1.174e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06959, 'epoch': 1.09, 'global_step/max_steps': '2360/5000', 'percentage': '47.20%', 'elapsed_time': '9h 24m 57s', 'remaining_time': '10h 31m 59s'}
+Train:  47%|████▋     | 2360/5000 [9:24:57<10:28:53, 14.29s/it]Train:  47%|████▋     | 2360/5000 [9:24:57<10:28:53, 14.29s/it]Train:  47%|████▋     | 2361/5000 [9:25:12<10:28:05, 14.28s/it]Train:  47%|████▋     | 2362/5000 [9:25:26<10:27:33, 14.27s/it]Train:  47%|████▋     | 2363/5000 [9:25:40<10:27:31, 14.28s/it]Train:  47%|████▋     | 2364/5000 [9:25:54<10:27:16, 14.28s/it]Train:  47%|████▋     | 2365/5000 [9:26:09<10:27:06, 14.28s/it]Train:  47%|████▋     | 2366/5000 [9:26:23<10:27:00, 14.28s/it]Train:  47%|████▋     | 2367/5000 [9:26:37<10:26:36, 14.28s/it]Train:  47%|████▋     | 2368/5000 [9:26:52<10:26:57, 14.29s/it]Train:  47%|████▋     | 2369/5000 [9:27:06<10:26:56, 14.30s/it]Train:  47%|████▋     | 2370/5000 [9:27:20<10:26:19, 14.29s/it]                                                               {'loss': 1.67789803, 'token_acc': 0.62949117, 'grad_norm': 0.24462275, 'learning_rate': 1.168e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069592, 'epoch': 1.09, 'global_step/max_steps': '2370/5000', 'percentage': '47.40%', 'elapsed_time': '9h 27m 20s', 'remaining_time': '10h 29m 35s'}
+Train:  47%|████▋     | 2370/5000 [9:27:20<10:26:19, 14.29s/it]Train:  47%|████▋     | 2370/5000 [9:27:20<10:26:19, 14.29s/it]Train:  47%|████▋     | 2371/5000 [9:27:34<10:26:26, 14.30s/it]Train:  47%|████▋     | 2372/5000 [9:27:49<10:26:17, 14.30s/it]Train:  47%|████▋     | 2373/5000 [9:28:03<10:26:13, 14.30s/it]Train:  47%|████▋     | 2374/5000 [9:28:17<10:26:05, 14.31s/it]Train:  48%|████▊     | 2375/5000 [9:28:32<10:25:27, 14.30s/it]Train:  48%|████▊     | 2376/5000 [9:28:46<10:25:28, 14.30s/it]Train:  48%|████▊     | 2377/5000 [9:29:00<10:24:33, 14.29s/it]Train:  48%|████▊     | 2378/5000 [9:29:14<10:24:12, 14.28s/it]Train:  48%|████▊     | 2379/5000 [9:29:29<10:24:13, 14.29s/it]Train:  48%|████▊     | 2380/5000 [9:29:43<10:23:59, 14.29s/it]                                                               {'loss': 1.68625107, 'token_acc': 0.62539092, 'grad_norm': 0.23916645, 'learning_rate': 1.161e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069594, 'epoch': 1.09, 'global_step/max_steps': '2380/5000', 'percentage': '47.60%', 'elapsed_time': '9h 29m 43s', 'remaining_time': '10h 27m 10s'}
+Train:  48%|████▊     | 2380/5000 [9:29:43<10:23:59, 14.29s/it]Train:  48%|████▊     | 2380/5000 [9:29:43<10:23:59, 14.29s/it]Train:  48%|████▊     | 2381/5000 [9:29:57<10:22:58, 14.27s/it]Train:  48%|████▊     | 2382/5000 [9:30:12<10:22:49, 14.27s/it]Train:  48%|████▊     | 2383/5000 [9:30:26<10:22:17, 14.27s/it]Train:  48%|████▊     | 2384/5000 [9:30:40<10:22:06, 14.27s/it]Train:  48%|████▊     | 2385/5000 [9:30:54<10:22:01, 14.27s/it]Train:  48%|████▊     | 2386/5000 [9:31:09<10:21:13, 14.26s/it]Train:  48%|████▊     | 2387/5000 [9:31:23<10:21:56, 14.28s/it]Train:  48%|████▊     | 2388/5000 [9:31:37<10:21:47, 14.28s/it]Train:  48%|████▊     | 2389/5000 [9:31:51<10:21:09, 14.27s/it]Train:  48%|████▊     | 2390/5000 [9:32:06<10:20:19, 14.26s/it]                                                               {'loss': 1.69327469, 'token_acc': 0.62623246, 'grad_norm': 0.23769847, 'learning_rate': 1.155e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069596, 'epoch': 1.1, 'global_step/max_steps': '2390/5000', 'percentage': '47.80%', 'elapsed_time': '9h 32m 6s', 'remaining_time': '10h 24m 45s'}
+Train:  48%|████▊     | 2390/5000 [9:32:06<10:20:19, 14.26s/it]Train:  48%|████▊     | 2390/5000 [9:32:06<10:20:19, 14.26s/it]Train:  48%|████▊     | 2391/5000 [9:32:20<10:20:17, 14.26s/it]Train:  48%|████▊     | 2392/5000 [9:32:34<10:20:28, 14.27s/it]Train:  48%|████▊     | 2393/5000 [9:32:49<10:19:53, 14.27s/it]Train:  48%|████▊     | 2394/5000 [9:33:03<10:19:21, 14.26s/it]Train:  48%|████▊     | 2395/5000 [9:33:17<10:19:08, 14.26s/it]Train:  48%|████▊     | 2396/5000 [9:33:31<10:19:12, 14.27s/it]Train:  48%|████▊     | 2397/5000 [9:33:46<10:18:33, 14.26s/it]Train:  48%|████▊     | 2398/5000 [9:34:00<10:18:28, 14.26s/it]Train:  48%|████▊     | 2399/5000 [9:34:14<10:18:11, 14.26s/it]Train:  48%|████▊     | 2400/5000 [9:34:28<10:18:24, 14.27s/it]                                                               {'loss': 1.67924404, 'token_acc': 0.62597078, 'grad_norm': 0.2314907, 'learning_rate': 1.148e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069598, 'epoch': 1.1, 'global_step/max_steps': '2400/5000', 'percentage': '48.00%', 'elapsed_time': '9h 34m 28s', 'remaining_time': '10h 22m 21s'}
+Train:  48%|████▊     | 2400/5000 [9:34:28<10:18:24, 14.27s/it]Train:  48%|████▊     | 2400/5000 [9:34:28<10:18:24, 14.27s/it]Train:  48%|████▊     | 2401/5000 [9:34:43<10:18:02, 14.27s/it]Train:  48%|████▊     | 2402/5000 [9:34:57<10:17:42, 14.27s/it]Train:  48%|████▊     | 2403/5000 [9:35:11<10:17:58, 14.28s/it]Train:  48%|████▊     | 2404/5000 [9:35:25<10:17:43, 14.28s/it]Train:  48%|████▊     | 2405/5000 [9:35:40<10:17:30, 14.28s/it]Train:  48%|████▊     | 2406/5000 [9:35:54<10:17:36, 14.29s/it]Train:  48%|████▊     | 2407/5000 [9:36:08<10:17:41, 14.29s/it]Train:  48%|████▊     | 2408/5000 [9:36:23<10:17:19, 14.29s/it]Train:  48%|████▊     | 2409/5000 [9:36:37<10:17:10, 14.29s/it]Train:  48%|████▊     | 2410/5000 [9:36:51<10:16:58, 14.29s/it]                                                               {'loss': 1.67545357, 'token_acc': 0.63128929, 'grad_norm': 0.24583179, 'learning_rate': 1.142e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.0696, 'epoch': 1.1, 'global_step/max_steps': '2410/5000', 'percentage': '48.20%', 'elapsed_time': '9h 36m 51s', 'remaining_time': '10h 19m 56s'}
+Train:  48%|████▊     | 2410/5000 [9:36:51<10:16:58, 14.29s/it]Train:  48%|████▊     | 2410/5000 [9:36:51<10:16:58, 14.29s/it]Train:  48%|████▊     | 2411/5000 [9:37:06<10:16:43, 14.29s/it]Train:  48%|████▊     | 2412/5000 [9:37:20<10:16:05, 14.28s/it]Train:  48%|████▊     | 2413/5000 [9:37:34<10:15:29, 14.28s/it]Train:  48%|████▊     | 2414/5000 [9:37:48<10:15:08, 14.27s/it]Train:  48%|████▊     | 2415/5000 [9:38:03<10:14:44, 14.27s/it]Train:  48%|████▊     | 2416/5000 [9:38:17<10:14:45, 14.27s/it]Train:  48%|████▊     | 2417/5000 [9:38:31<10:14:36, 14.28s/it]Train:  48%|████▊     | 2418/5000 [9:38:45<10:13:59, 14.27s/it]Train:  48%|████▊     | 2419/5000 [9:39:00<10:13:50, 14.27s/it]Train:  48%|████▊     | 2420/5000 [9:39:14<10:13:58, 14.28s/it]                                                               {'loss': 1.68599854, 'token_acc': 0.62018546, 'grad_norm': 0.23634142, 'learning_rate': 1.135e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069601, 'epoch': 1.1, 'global_step/max_steps': '2420/5000', 'percentage': '48.40%', 'elapsed_time': '9h 39m 14s', 'remaining_time': '10h 17m 32s'}
+Train:  48%|████▊     | 2420/5000 [9:39:14<10:13:58, 14.28s/it]Train:  48%|████▊     | 2420/5000 [9:39:14<10:13:58, 14.28s/it]Train:  48%|████▊     | 2421/5000 [9:39:28<10:13:07, 14.26s/it]Train:  48%|████▊     | 2422/5000 [9:39:42<10:12:50, 14.26s/it]Train:  48%|████▊     | 2423/5000 [9:39:57<10:12:39, 14.26s/it]Train:  48%|████▊     | 2424/5000 [9:40:11<10:12:39, 14.27s/it]Train:  48%|████▊     | 2425/5000 [9:40:25<10:12:38, 14.28s/it]Train:  49%|████▊     | 2426/5000 [9:40:40<10:13:19, 14.30s/it]Train:  49%|████▊     | 2427/5000 [9:40:54<10:12:43, 14.29s/it]Train:  49%|████▊     | 2428/5000 [9:41:08<10:12:48, 14.30s/it]Train:  49%|████▊     | 2429/5000 [9:41:23<10:12:26, 14.29s/it]Train:  49%|████▊     | 2430/5000 [9:41:37<10:12:39, 14.30s/it]                                                               {'loss': 1.68382187, 'token_acc': 0.63201999, 'grad_norm': 0.24079922, 'learning_rate': 1.129e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069603, 'epoch': 1.1, 'global_step/max_steps': '2430/5000', 'percentage': '48.60%', 'elapsed_time': '9h 41m 37s', 'remaining_time': '10h 15m 7s'}
+Train:  49%|████▊     | 2430/5000 [9:41:37<10:12:39, 14.30s/it]Train:  49%|████▊     | 2430/5000 [9:41:37<10:12:39, 14.30s/it]Train:  49%|████▊     | 2431/5000 [9:41:51<10:12:20, 14.30s/it]Train:  49%|████▊     | 2432/5000 [9:42:05<10:12:02, 14.30s/it]Train:  49%|████▊     | 2433/5000 [9:42:20<10:10:59, 14.28s/it]Train:  49%|████▊     | 2434/5000 [9:42:34<10:10:35, 14.28s/it]Train:  49%|████▊     | 2435/5000 [9:42:48<10:10:43, 14.29s/it]Train:  49%|████▊     | 2436/5000 [9:43:03<10:10:56, 14.30s/it]Train:  49%|████▊     | 2437/5000 [9:43:17<10:10:29, 14.29s/it]Train:  49%|████▉     | 2438/5000 [9:43:31<10:10:10, 14.29s/it]Train:  49%|████▉     | 2439/5000 [9:43:45<10:09:33, 14.28s/it]Train:  49%|████▉     | 2440/5000 [9:44:00<10:09:39, 14.29s/it]                                                               {'loss': 1.69090824, 'token_acc': 0.62503312, 'grad_norm': 0.23699911, 'learning_rate': 1.122e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069605, 'epoch': 1.11, 'global_step/max_steps': '2440/5000', 'percentage': '48.80%', 'elapsed_time': '9h 44m 0s', 'remaining_time': '10h 12m 43s'}
+Train:  49%|████▉     | 2440/5000 [9:44:00<10:09:39, 14.29s/it]Train:  49%|████▉     | 2440/5000 [9:44:00<10:09:39, 14.29s/it]Train:  49%|████▉     | 2441/5000 [9:44:14<10:09:42, 14.30s/it]Train:  49%|████▉     | 2442/5000 [9:44:28<10:09:47, 14.30s/it]Train:  49%|████▉     | 2443/5000 [9:44:43<10:09:50, 14.31s/it]Train:  49%|████▉     | 2444/5000 [9:44:57<10:09:17, 14.30s/it]Train:  49%|████▉     | 2445/5000 [9:45:11<10:09:02, 14.30s/it]Train:  49%|████▉     | 2446/5000 [9:45:25<10:08:11, 14.29s/it]Train:  49%|████▉     | 2447/5000 [9:45:40<10:08:12, 14.29s/it]Train:  49%|████▉     | 2448/5000 [9:45:54<10:07:47, 14.29s/it]Train:  49%|████▉     | 2449/5000 [9:46:08<10:07:00, 14.28s/it]Train:  49%|████▉     | 2450/5000 [9:46:23<10:07:03, 14.28s/it]                                                               {'loss': 1.67889576, 'token_acc': 0.6356892, 'grad_norm': 0.23629452, 'learning_rate': 1.115e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069606, 'epoch': 1.11, 'global_step/max_steps': '2450/5000', 'percentage': '49.00%', 'elapsed_time': '9h 46m 23s', 'remaining_time': '10h 10m 19s'}
+Train:  49%|████▉     | 2450/5000 [9:46:23<10:07:03, 14.28s/it]Train:  49%|████▉     | 2450/5000 [9:46:23<10:07:03, 14.28s/it]Train:  49%|████▉     | 2451/5000 [9:46:37<10:07:35, 14.30s/it]Train:  49%|████▉     | 2452/5000 [9:46:51<10:07:16, 14.30s/it]Train:  49%|████▉     | 2453/5000 [9:47:06<10:06:57, 14.30s/it]Train:  49%|████▉     | 2454/5000 [9:47:20<10:06:30, 14.29s/it]Train:  49%|████▉     | 2455/5000 [9:47:34<10:06:32, 14.30s/it]Train:  49%|████▉     | 2456/5000 [9:47:48<10:06:29, 14.30s/it]Train:  49%|████▉     | 2457/5000 [9:48:03<10:06:23, 14.31s/it]Train:  49%|████▉     | 2458/5000 [9:48:17<10:06:14, 14.31s/it]Train:  49%|████▉     | 2459/5000 [9:48:31<10:05:47, 14.30s/it]Train:  49%|████▉     | 2460/5000 [9:48:46<10:05:53, 14.31s/it]                                                               {'loss': 1.68592873, 'token_acc': 0.63010703, 'grad_norm': 0.2432076, 'learning_rate': 1.109e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069607, 'epoch': 1.11, 'global_step/max_steps': '2460/5000', 'percentage': '49.20%', 'elapsed_time': '9h 48m 46s', 'remaining_time': '10h 7m 55s'}
+Train:  49%|████▉     | 2460/5000 [9:48:46<10:05:53, 14.31s/it]Train:  49%|████▉     | 2460/5000 [9:48:46<10:05:53, 14.31s/it]Train:  49%|████▉     | 2461/5000 [9:49:00<10:05:22, 14.31s/it]Train:  49%|████▉     | 2462/5000 [9:49:14<10:05:01, 14.30s/it]Train:  49%|████▉     | 2463/5000 [9:49:29<10:05:04, 14.31s/it]Train:  49%|████▉     | 2464/5000 [9:49:43<10:04:43, 14.31s/it]Train:  49%|████▉     | 2465/5000 [9:49:57<10:04:29, 14.31s/it]Train:  49%|████▉     | 2466/5000 [9:50:12<10:03:39, 14.29s/it]Train:  49%|████▉     | 2467/5000 [9:50:26<10:03:44, 14.30s/it]Train:  49%|████▉     | 2468/5000 [9:50:40<10:03:42, 14.31s/it]Train:  49%|████▉     | 2469/5000 [9:50:54<10:03:24, 14.30s/it]Train:  49%|████▉     | 2470/5000 [9:51:09<10:03:23, 14.31s/it]                                                               {'loss': 1.67183685, 'token_acc': 0.6345609, 'grad_norm': 0.23839647, 'learning_rate': 1.102e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069608, 'epoch': 1.11, 'global_step/max_steps': '2470/5000', 'percentage': '49.40%', 'elapsed_time': '9h 51m 9s', 'remaining_time': '10h 5m 30s'}
+Train:  49%|████▉     | 2470/5000 [9:51:09<10:03:23, 14.31s/it]Train:  49%|████▉     | 2470/5000 [9:51:09<10:03:23, 14.31s/it]Train:  49%|████▉     | 2471/5000 [9:51:23<10:03:15, 14.31s/it]Train:  49%|████▉     | 2472/5000 [9:51:37<10:02:46, 14.31s/it]Train:  49%|████▉     | 2473/5000 [9:51:52<10:02:14, 14.30s/it]Train:  49%|████▉     | 2474/5000 [9:52:06<10:01:29, 14.29s/it]Train:  50%|████▉     | 2475/5000 [9:52:20<10:01:20, 14.29s/it]Train:  50%|████▉     | 2476/5000 [9:52:34<10:00:52, 14.28s/it]Train:  50%|████▉     | 2477/5000 [9:52:49<10:00:51, 14.29s/it]Train:  50%|████▉     | 2478/5000 [9:53:03<10:01:17, 14.31s/it]Train:  50%|████▉     | 2479/5000 [9:53:17<10:00:33, 14.29s/it]Train:  50%|████▉     | 2480/5000 [9:53:32<9:59:55, 14.28s/it]                                                               {'loss': 1.66232433, 'token_acc': 0.63220876, 'grad_norm': 0.24124733, 'learning_rate': 1.096e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06961, 'epoch': 1.11, 'global_step/max_steps': '2480/5000', 'percentage': '49.60%', 'elapsed_time': '9h 53m 32s', 'remaining_time': '10h 3m 6s'}
+Train:  50%|████▉     | 2480/5000 [9:53:32<9:59:55, 14.28s/it]Train:  50%|████▉     | 2480/5000 [9:53:32<9:59:55, 14.28s/it]Train:  50%|████▉     | 2481/5000 [9:53:46<10:00:09, 14.30s/it]Train:  50%|████▉     | 2482/5000 [9:54:00<9:59:35, 14.29s/it] Train:  50%|████▉     | 2483/5000 [9:54:15<9:59:06, 14.28s/it]Train:  50%|████▉     | 2484/5000 [9:54:29<9:58:34, 14.27s/it]Train:  50%|████▉     | 2485/5000 [9:54:43<9:58:34, 14.28s/it]Train:  50%|████▉     | 2486/5000 [9:54:57<9:58:24, 14.28s/it]Train:  50%|████▉     | 2487/5000 [9:55:12<9:58:26, 14.29s/it]Train:  50%|████▉     | 2488/5000 [9:55:26<9:57:55, 14.28s/it]Train:  50%|████▉     | 2489/5000 [9:55:40<9:57:52, 14.29s/it]Train:  50%|████▉     | 2490/5000 [9:55:54<9:57:25, 14.28s/it]                                                              {'loss': 1.67429218, 'token_acc': 0.63460631, 'grad_norm': 0.24363814, 'learning_rate': 1.089e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069612, 'epoch': 1.12, 'global_step/max_steps': '2490/5000', 'percentage': '49.80%', 'elapsed_time': '9h 55m 54s', 'remaining_time': '10h 0m 42s'}
+Train:  50%|████▉     | 2490/5000 [9:55:54<9:57:25, 14.28s/it]Train:  50%|████▉     | 2490/5000 [9:55:54<9:57:25, 14.28s/it]Train:  50%|████▉     | 2491/5000 [9:56:09<9:57:04, 14.28s/it]Train:  50%|████▉     | 2492/5000 [9:56:23<9:57:04, 14.28s/it]Train:  50%|████▉     | 2493/5000 [9:56:37<9:56:34, 14.28s/it]Train:  50%|████▉     | 2494/5000 [9:56:52<9:55:48, 14.27s/it]Train:  50%|████▉     | 2495/5000 [9:57:06<9:55:25, 14.26s/it]Train:  50%|████▉     | 2496/5000 [9:57:20<9:55:30, 14.27s/it]Train:  50%|████▉     | 2497/5000 [9:57:34<9:55:21, 14.27s/it]Train:  50%|████▉     | 2498/5000 [9:57:49<9:55:32, 14.28s/it]Train:  50%|████▉     | 2499/5000 [9:58:03<9:55:13, 14.28s/it]Train:  50%|█████     | 2500/5000 [9:58:17<9:54:42, 14.27s/it]                                                              {'loss': 1.67177429, 'token_acc': 0.62792872, 'grad_norm': 0.23944354, 'learning_rate': 1.083e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069613, 'epoch': 1.12, 'global_step/max_steps': '2500/5000', 'percentage': '50.00%', 'elapsed_time': '9h 58m 17s', 'remaining_time': '9h 58m 17s'}
+Train:  50%|█████     | 2500/5000 [9:58:17<9:54:42, 14.27s/it]Train:  50%|█████     | 2500/5000 [9:58:17<9:54:42, 14.27s/it]Train:  50%|█████     | 2501/5000 [9:58:31<9:54:06, 14.26s/it]Train:  50%|█████     | 2502/5000 [9:58:46<9:54:07, 14.27s/it]Train:  50%|█████     | 2503/5000 [9:59:00<9:54:19, 14.28s/it]Train:  50%|█████     | 2504/5000 [9:59:14<9:53:39, 14.27s/it]Train:  50%|█████     | 2505/5000 [9:59:29<9:53:12, 14.27s/it]Train:  50%|█████     | 2506/5000 [9:59:43<9:52:39, 14.26s/it]Train:  50%|█████     | 2507/5000 [9:59:57<9:52:30, 14.26s/it]Train:  50%|█████     | 2508/5000 [10:00:11<9:52:24, 14.26s/it]Train:  50%|█████     | 2509/5000 [10:00:26<9:52:28, 14.27s/it]Train:  50%|█████     | 2510/5000 [10:00:40<9:52:17, 14.27s/it]                                                               {'loss': 1.66981544, 'token_acc': 0.63324596, 'grad_norm': 0.23829199, 'learning_rate': 1.076e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069615, 'epoch': 1.12, 'global_step/max_steps': '2510/5000', 'percentage': '50.20%', 'elapsed_time': '10h 0m 40s', 'remaining_time': '9h 55m 53s'}
+Train:  50%|█████     | 2510/5000 [10:00:40<9:52:17, 14.27s/it]Train:  50%|█████     | 2510/5000 [10:00:40<9:52:17, 14.27s/it]Train:  50%|█████     | 2511/5000 [10:00:54<9:51:28, 14.26s/it]Train:  50%|█████     | 2512/5000 [10:01:08<9:51:37, 14.27s/it]Train:  50%|█████     | 2513/5000 [10:01:23<9:51:25, 14.27s/it]Train:  50%|█████     | 2514/5000 [10:01:37<9:50:48, 14.26s/it]Train:  50%|█████     | 2515/5000 [10:01:51<9:50:48, 14.26s/it]Train:  50%|█████     | 2516/5000 [10:02:05<9:50:45, 14.27s/it]Train:  50%|█████     | 2517/5000 [10:02:20<9:50:55, 14.28s/it]Train:  50%|█████     | 2518/5000 [10:02:34<9:50:36, 14.28s/it]Train:  50%|█████     | 2519/5000 [10:02:48<9:50:35, 14.28s/it]Train:  50%|█████     | 2520/5000 [10:03:03<9:52:17, 14.33s/it]                                                               {'loss': 1.67606888, 'token_acc': 0.63254738, 'grad_norm': 0.24341117, 'learning_rate': 1.069e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069617, 'epoch': 1.12, 'global_step/max_steps': '2520/5000', 'percentage': '50.40%', 'elapsed_time': '10h 3m 3s', 'remaining_time': '9h 53m 28s'}
+Train:  50%|█████     | 2520/5000 [10:03:03<9:52:17, 14.33s/it]Train:  50%|█████     | 2520/5000 [10:03:03<9:52:17, 14.33s/it]Train:  50%|█████     | 2521/5000 [10:03:17<9:50:41, 14.30s/it]Train:  50%|█████     | 2522/5000 [10:03:31<9:49:58, 14.29s/it]Train:  50%|█████     | 2523/5000 [10:03:46<9:49:34, 14.28s/it]Train:  50%|█████     | 2524/5000 [10:04:00<9:49:03, 14.27s/it]Train:  50%|█████     | 2525/5000 [10:04:14<9:49:16, 14.29s/it]Train:  51%|█████     | 2526/5000 [10:04:28<9:48:54, 14.28s/it]Train:  51%|█████     | 2527/5000 [10:04:43<9:48:20, 14.27s/it]Train:  51%|█████     | 2528/5000 [10:04:57<9:48:06, 14.27s/it]Train:  51%|█████     | 2529/5000 [10:05:11<9:48:10, 14.28s/it]Train:  51%|█████     | 2530/5000 [10:05:25<9:47:46, 14.28s/it]                                                               {'loss': 1.67553444, 'token_acc': 0.63111571, 'grad_norm': 0.24607843, 'learning_rate': 1.063e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069619, 'epoch': 1.12, 'global_step/max_steps': '2530/5000', 'percentage': '50.60%', 'elapsed_time': '10h 5m 25s', 'remaining_time': '9h 51m 4s'}
+Train:  51%|█████     | 2530/5000 [10:05:25<9:47:46, 14.28s/it]Train:  51%|█████     | 2530/5000 [10:05:25<9:47:46, 14.28s/it]Train:  51%|█████     | 2531/5000 [10:05:40<9:47:59, 14.29s/it]Train:  51%|█████     | 2532/5000 [10:05:54<9:47:46, 14.29s/it]Train:  51%|█████     | 2533/5000 [10:06:08<9:47:37, 14.29s/it]Train:  51%|█████     | 2534/5000 [10:06:23<9:46:34, 14.27s/it]Train:  51%|█████     | 2535/5000 [10:06:37<9:45:59, 14.26s/it]Train:  51%|█████     | 2536/5000 [10:06:51<9:45:52, 14.27s/it]Train:  51%|█████     | 2537/5000 [10:07:05<9:45:23, 14.26s/it]Train:  51%|█████     | 2538/5000 [10:07:20<9:44:55, 14.25s/it]Train:  51%|█████     | 2539/5000 [10:07:34<9:44:24, 14.25s/it]Train:  51%|█████     | 2540/5000 [10:07:48<9:43:58, 14.24s/it]                                                               {'loss': 1.68528004, 'token_acc': 0.62773108, 'grad_norm': 0.2384171, 'learning_rate': 1.056e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06962, 'epoch': 1.13, 'global_step/max_steps': '2540/5000', 'percentage': '50.80%', 'elapsed_time': '10h 7m 48s', 'remaining_time': '9h 48m 39s'}
+Train:  51%|█████     | 2540/5000 [10:07:48<9:43:58, 14.24s/it]Train:  51%|█████     | 2540/5000 [10:07:48<9:43:58, 14.24s/it]Train:  51%|█████     | 2541/5000 [10:08:02<9:44:05, 14.25s/it]Train:  51%|█████     | 2542/5000 [10:08:17<9:43:48, 14.25s/it]Train:  51%|█████     | 2543/5000 [10:08:31<9:43:55, 14.26s/it]Train:  51%|█████     | 2544/5000 [10:08:45<9:44:17, 14.27s/it]Train:  51%|█████     | 2545/5000 [10:08:59<9:44:27, 14.28s/it]Train:  51%|█████     | 2546/5000 [10:09:14<9:44:20, 14.29s/it]Train:  51%|█████     | 2547/5000 [10:09:28<9:44:22, 14.29s/it]Train:  51%|█████     | 2548/5000 [10:09:42<9:44:25, 14.30s/it]Train:  51%|█████     | 2549/5000 [10:09:57<9:43:37, 14.29s/it]Train:  51%|█████     | 2550/5000 [10:10:11<9:43:35, 14.29s/it]                                                               {'loss': 1.6767868, 'token_acc': 0.62476092, 'grad_norm': 0.24214213, 'learning_rate': 1.05e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069622, 'epoch': 1.13, 'global_step/max_steps': '2550/5000', 'percentage': '51.00%', 'elapsed_time': '10h 10m 11s', 'remaining_time': '9h 46m 15s'}
+Train:  51%|█████     | 2550/5000 [10:10:11<9:43:35, 14.29s/it]Train:  51%|█████     | 2550/5000 [10:10:11<9:43:35, 14.29s/it]Train:  51%|█████     | 2551/5000 [10:10:25<9:43:18, 14.29s/it]Train:  51%|█████     | 2552/5000 [10:10:40<9:43:30, 14.30s/it]Train:  51%|█████     | 2553/5000 [10:10:54<9:42:49, 14.29s/it]Train:  51%|█████     | 2554/5000 [10:11:08<9:42:54, 14.30s/it]Train:  51%|█████     | 2555/5000 [10:11:23<9:43:15, 14.31s/it]Train:  51%|█████     | 2556/5000 [10:11:37<9:43:03, 14.31s/it]Train:  51%|█████     | 2557/5000 [10:11:51<9:42:37, 14.31s/it]Train:  51%|█████     | 2558/5000 [10:12:05<9:42:18, 14.31s/it]Train:  51%|█████     | 2559/5000 [10:12:20<9:41:53, 14.30s/it]Train:  51%|█████     | 2560/5000 [10:12:34<9:40:55, 14.28s/it]                                                               {'loss': 1.67695637, 'token_acc': 0.63218082, 'grad_norm': 0.23762341, 'learning_rate': 1.043e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069623, 'epoch': 1.13, 'global_step/max_steps': '2560/5000', 'percentage': '51.20%', 'elapsed_time': '10h 12m 34s', 'remaining_time': '9h 43m 51s'}
+Train:  51%|█████     | 2560/5000 [10:12:34<9:40:55, 14.28s/it]Train:  51%|█████     | 2560/5000 [10:12:34<9:40:55, 14.28s/it]Train:  51%|█████     | 2561/5000 [10:12:48<9:40:48, 14.29s/it]Train:  51%|█████     | 2562/5000 [10:13:03<9:40:57, 14.30s/it]Train:  51%|█████▏    | 2563/5000 [10:13:17<9:40:54, 14.30s/it]Train:  51%|█████▏    | 2564/5000 [10:13:31<9:40:41, 14.30s/it]Train:  51%|█████▏    | 2565/5000 [10:13:46<9:40:46, 14.31s/it]Train:  51%|█████▏    | 2566/5000 [10:14:00<9:40:27, 14.31s/it]Train:  51%|█████▏    | 2567/5000 [10:14:14<9:40:01, 14.30s/it]Train:  51%|█████▏    | 2568/5000 [10:14:28<9:39:45, 14.30s/it]Train:  51%|█████▏    | 2569/5000 [10:14:43<9:39:21, 14.30s/it]Train:  51%|█████▏    | 2570/5000 [10:14:57<9:38:56, 14.29s/it]                                                               {'loss': 1.68409843, 'token_acc': 0.63115322, 'grad_norm': 0.23605612, 'learning_rate': 1.036e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069624, 'epoch': 1.13, 'global_step/max_steps': '2570/5000', 'percentage': '51.40%', 'elapsed_time': '10h 14m 57s', 'remaining_time': '9h 41m 27s'}
+Train:  51%|█████▏    | 2570/5000 [10:14:57<9:38:56, 14.29s/it]Train:  51%|█████▏    | 2570/5000 [10:14:57<9:38:56, 14.29s/it]Train:  51%|█████▏    | 2571/5000 [10:15:11<9:38:50, 14.30s/it]Train:  51%|█████▏    | 2572/5000 [10:15:26<9:38:37, 14.30s/it]Train:  51%|█████▏    | 2573/5000 [10:15:40<9:38:28, 14.30s/it]Train:  51%|█████▏    | 2574/5000 [10:15:54<9:38:04, 14.30s/it]Train:  52%|█████▏    | 2575/5000 [10:16:08<9:37:08, 14.28s/it]Train:  52%|█████▏    | 2576/5000 [10:16:23<9:37:06, 14.28s/it]Train:  52%|█████▏    | 2577/5000 [10:16:37<9:37:35, 14.30s/it]Train:  52%|█████▏    | 2578/5000 [10:16:51<9:36:30, 14.28s/it]Train:  52%|█████▏    | 2579/5000 [10:17:06<9:36:47, 14.29s/it]Train:  52%|█████▏    | 2580/5000 [10:17:20<9:36:11, 14.29s/it]                                                               {'loss': 1.67265854, 'token_acc': 0.62655603, 'grad_norm': 0.24530908, 'learning_rate': 1.03e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069626, 'epoch': 1.13, 'global_step/max_steps': '2580/5000', 'percentage': '51.60%', 'elapsed_time': '10h 17m 20s', 'remaining_time': '9h 39m 3s'}
+Train:  52%|█████▏    | 2580/5000 [10:17:20<9:36:11, 14.29s/it]Train:  52%|█████▏    | 2580/5000 [10:17:20<9:36:11, 14.29s/it]Train:  52%|█████▏    | 2581/5000 [10:17:34<9:35:43, 14.28s/it]Train:  52%|█████▏    | 2582/5000 [10:17:48<9:35:30, 14.28s/it]Train:  52%|█████▏    | 2583/5000 [10:18:03<9:35:44, 14.29s/it]Train:  52%|█████▏    | 2584/5000 [10:18:17<9:35:03, 14.28s/it]Train:  52%|█████▏    | 2585/5000 [10:18:31<9:34:29, 14.27s/it]Train:  52%|█████▏    | 2586/5000 [10:18:46<9:34:17, 14.27s/it]Train:  52%|█████▏    | 2587/5000 [10:19:00<9:34:18, 14.28s/it]Train:  52%|█████▏    | 2588/5000 [10:19:14<9:34:06, 14.28s/it]Train:  52%|█████▏    | 2589/5000 [10:19:28<9:33:30, 14.27s/it]Train:  52%|█████▏    | 2590/5000 [10:19:43<9:33:06, 14.27s/it]                                                               {'loss': 1.67070389, 'token_acc': 0.62900318, 'grad_norm': 0.24204104, 'learning_rate': 1.023e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069627, 'epoch': 1.14, 'global_step/max_steps': '2590/5000', 'percentage': '51.80%', 'elapsed_time': '10h 19m 43s', 'remaining_time': '9h 36m 38s'}
+Train:  52%|█████▏    | 2590/5000 [10:19:43<9:33:06, 14.27s/it]Train:  52%|█████▏    | 2590/5000 [10:19:43<9:33:06, 14.27s/it]Train:  52%|█████▏    | 2591/5000 [10:19:57<9:33:47, 14.29s/it]Train:  52%|█████▏    | 2592/5000 [10:20:11<9:33:42, 14.30s/it]Train:  52%|█████▏    | 2593/5000 [10:20:26<9:32:38, 14.27s/it]Train:  52%|█████▏    | 2594/5000 [10:20:40<9:32:32, 14.28s/it]Train:  52%|█████▏    | 2595/5000 [10:20:54<9:32:24, 14.28s/it]Train:  52%|█████▏    | 2596/5000 [10:21:08<9:31:47, 14.27s/it]Train:  52%|█████▏    | 2597/5000 [10:21:23<9:31:18, 14.26s/it]Train:  52%|█████▏    | 2598/5000 [10:21:37<9:31:28, 14.27s/it]Train:  52%|█████▏    | 2599/5000 [10:21:51<9:30:54, 14.27s/it]Train:  52%|█████▏    | 2600/5000 [10:22:05<9:30:42, 14.27s/it]                                                               {'loss': 1.66730728, 'token_acc': 0.63389219, 'grad_norm': 0.24445403, 'learning_rate': 1.017e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069629, 'epoch': 1.14, 'global_step/max_steps': '2600/5000', 'percentage': '52.00%', 'elapsed_time': '10h 22m 5s', 'remaining_time': '9h 34m 14s'}
+Train:  52%|█████▏    | 2600/5000 [10:22:05<9:30:42, 14.27s/it]Train:  52%|█████▏    | 2600/5000 [10:22:05<9:30:42, 14.27s/it]Train:  52%|█████▏    | 2601/5000 [10:22:20<9:30:44, 14.27s/it]Train:  52%|█████▏    | 2602/5000 [10:22:34<9:30:14, 14.27s/it]Train:  52%|█████▏    | 2603/5000 [10:22:48<9:30:00, 14.27s/it]Train:  52%|█████▏    | 2604/5000 [10:23:02<9:29:53, 14.27s/it]Train:  52%|█████▏    | 2605/5000 [10:23:17<9:29:35, 14.27s/it]Train:  52%|█████▏    | 2606/5000 [10:23:31<9:29:12, 14.27s/it]Train:  52%|█████▏    | 2607/5000 [10:23:45<9:29:12, 14.27s/it]Train:  52%|█████▏    | 2608/5000 [10:24:00<9:29:02, 14.27s/it]Train:  52%|█████▏    | 2609/5000 [10:24:14<9:29:09, 14.28s/it]Train:  52%|█████▏    | 2610/5000 [10:24:28<9:29:14, 14.29s/it]                                                               {'loss': 1.67950058, 'token_acc': 0.63399754, 'grad_norm': 0.23875895, 'learning_rate': 1.01e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06963, 'epoch': 1.14, 'global_step/max_steps': '2610/5000', 'percentage': '52.20%', 'elapsed_time': '10h 24m 28s', 'remaining_time': '9h 31m 50s'}
+Train:  52%|█████▏    | 2610/5000 [10:24:28<9:29:14, 14.29s/it]Train:  52%|█████▏    | 2610/5000 [10:24:28<9:29:14, 14.29s/it]Train:  52%|█████▏    | 2611/5000 [10:24:43<9:29:15, 14.30s/it]Train:  52%|█████▏    | 2612/5000 [10:24:57<9:29:08, 14.30s/it]Train:  52%|█████▏    | 2613/5000 [10:25:11<9:28:50, 14.30s/it]Train:  52%|█████▏    | 2614/5000 [10:25:25<9:28:17, 14.29s/it]Train:  52%|█████▏    | 2615/5000 [10:25:40<9:28:05, 14.29s/it]Train:  52%|█████▏    | 2616/5000 [10:25:54<9:27:00, 14.27s/it]Train:  52%|█████▏    | 2617/5000 [10:26:08<9:26:42, 14.27s/it]Train:  52%|█████▏    | 2618/5000 [10:26:22<9:26:31, 14.27s/it]Train:  52%|█████▏    | 2619/5000 [10:26:37<9:26:13, 14.27s/it]Train:  52%|█████▏    | 2620/5000 [10:26:51<9:25:42, 14.26s/it]                                                               {'loss': 1.67020645, 'token_acc': 0.62996566, 'grad_norm': 0.23416026, 'learning_rate': 1.003e-05, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069632, 'epoch': 1.14, 'global_step/max_steps': '2620/5000', 'percentage': '52.40%', 'elapsed_time': '10h 26m 51s', 'remaining_time': '9h 29m 26s'}
+Train:  52%|█████▏    | 2620/5000 [10:26:51<9:25:42, 14.26s/it]Train:  52%|█████▏    | 2620/5000 [10:26:51<9:25:42, 14.26s/it]Train:  52%|█████▏    | 2621/5000 [10:27:05<9:25:56, 14.27s/it]Train:  52%|█████▏    | 2622/5000 [10:27:20<9:25:29, 14.27s/it]Train:  52%|█████▏    | 2623/5000 [10:27:34<9:25:47, 14.28s/it]Train:  52%|█████▏    | 2624/5000 [10:27:48<9:25:52, 14.29s/it]Train:  52%|█████▎    | 2625/5000 [10:28:02<9:25:28, 14.29s/it]Train:  53%|█████▎    | 2626/5000 [10:28:17<9:24:59, 14.28s/it]Train:  53%|█████▎    | 2627/5000 [10:28:31<9:24:08, 14.26s/it]Train:  53%|█████▎    | 2628/5000 [10:28:45<9:23:42, 14.26s/it]Train:  53%|█████▎    | 2629/5000 [10:28:59<9:23:51, 14.27s/it]Train:  53%|█████▎    | 2630/5000 [10:29:14<9:23:20, 14.26s/it]                                                               {'loss': 1.66840439, 'token_acc': 0.63226376, 'grad_norm': 0.24525027, 'learning_rate': 9.97e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069634, 'epoch': 1.14, 'global_step/max_steps': '2630/5000', 'percentage': '52.60%', 'elapsed_time': '10h 29m 14s', 'remaining_time': '9h 27m 1s'}
+Train:  53%|█████▎    | 2630/5000 [10:29:14<9:23:20, 14.26s/it]Train:  53%|█████▎    | 2630/5000 [10:29:14<9:23:20, 14.26s/it]Train:  53%|█████▎    | 2631/5000 [10:29:28<9:23:30, 14.27s/it]Train:  53%|█████▎    | 2632/5000 [10:29:42<9:23:15, 14.27s/it]Train:  53%|█████▎    | 2633/5000 [10:29:56<9:22:45, 14.27s/it]Train:  53%|█████▎    | 2634/5000 [10:30:11<9:23:11, 14.28s/it]Train:  53%|█████▎    | 2635/5000 [10:30:25<9:22:58, 14.28s/it]Train:  53%|█████▎    | 2636/5000 [10:30:39<9:23:20, 14.30s/it]Train:  53%|█████▎    | 2637/5000 [10:30:54<9:22:41, 14.29s/it]Train:  53%|█████▎    | 2638/5000 [10:31:08<9:22:22, 14.29s/it]Train:  53%|█████▎    | 2639/5000 [10:31:22<9:21:46, 14.28s/it]Train:  53%|█████▎    | 2640/5000 [10:31:37<9:21:49, 14.28s/it]                                                               {'loss': 1.67907944, 'token_acc': 0.62846669, 'grad_norm': 0.23776421, 'learning_rate': 9.9e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069635, 'epoch': 1.15, 'global_step/max_steps': '2640/5000', 'percentage': '52.80%', 'elapsed_time': '10h 31m 37s', 'remaining_time': '9h 24m 37s'}
+Train:  53%|█████▎    | 2640/5000 [10:31:37<9:21:49, 14.28s/it]Train:  53%|█████▎    | 2640/5000 [10:31:37<9:21:49, 14.28s/it]Train:  53%|█████▎    | 2641/5000 [10:31:51<9:21:44, 14.29s/it]Train:  53%|█████▎    | 2642/5000 [10:32:05<9:21:13, 14.28s/it]Train:  53%|█████▎    | 2643/5000 [10:32:19<9:20:50, 14.28s/it]Train:  53%|█████▎    | 2644/5000 [10:32:34<9:20:54, 14.28s/it]Train:  53%|█████▎    | 2645/5000 [10:32:48<9:20:41, 14.29s/it]Train:  53%|█████▎    | 2646/5000 [10:33:02<9:20:35, 14.29s/it]Train:  53%|█████▎    | 2647/5000 [10:33:17<9:20:20, 14.29s/it]Train:  53%|█████▎    | 2648/5000 [10:33:31<9:20:20, 14.29s/it]Train:  53%|█████▎    | 2649/5000 [10:33:45<9:19:56, 14.29s/it]Train:  53%|█████▎    | 2650/5000 [10:33:59<9:19:39, 14.29s/it]                                                               {'loss': 1.67400589, 'token_acc': 0.63478805, 'grad_norm': 0.23670979, 'learning_rate': 9.83e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069636, 'epoch': 1.15, 'global_step/max_steps': '2650/5000', 'percentage': '53.00%', 'elapsed_time': '10h 33m 59s', 'remaining_time': '9h 22m 13s'}
+Train:  53%|█████▎    | 2650/5000 [10:33:59<9:19:39, 14.29s/it]Train:  53%|█████▎    | 2650/5000 [10:33:59<9:19:39, 14.29s/it]Train:  53%|█████▎    | 2651/5000 [10:34:14<9:19:14, 14.28s/it]Train:  53%|█████▎    | 2652/5000 [10:34:28<9:18:28, 14.27s/it]Train:  53%|█████▎    | 2653/5000 [10:34:42<9:17:50, 14.26s/it]Train:  53%|█████▎    | 2654/5000 [10:34:56<9:18:13, 14.28s/it]Train:  53%|█████▎    | 2655/5000 [10:35:11<9:17:39, 14.27s/it]Train:  53%|█████▎    | 2656/5000 [10:35:25<9:16:46, 14.25s/it]Train:  53%|█████▎    | 2657/5000 [10:35:39<9:17:11, 14.27s/it]Train:  53%|█████▎    | 2658/5000 [10:35:54<9:17:16, 14.28s/it]Train:  53%|█████▎    | 2659/5000 [10:36:08<9:17:29, 14.29s/it]Train:  53%|█████▎    | 2660/5000 [10:36:22<9:17:20, 14.29s/it]                                                               {'loss': 1.67631855, 'token_acc': 0.63390145, 'grad_norm': 0.23990001, 'learning_rate': 9.77e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069638, 'epoch': 1.15, 'global_step/max_steps': '2660/5000', 'percentage': '53.20%', 'elapsed_time': '10h 36m 22s', 'remaining_time': '9h 19m 49s'}
+Train:  53%|██��██▎    | 2660/5000 [10:36:22<9:17:20, 14.29s/it]Train:  53%|█████▎    | 2660/5000 [10:36:22<9:17:20, 14.29s/it]Train:  53%|█████▎    | 2661/5000 [10:36:36<9:16:53, 14.29s/it]Train:  53%|█████▎    | 2662/5000 [10:36:51<9:16:45, 14.29s/it]Train:  53%|█████▎    | 2663/5000 [10:37:05<9:16:03, 14.28s/it]Train:  53%|█████▎    | 2664/5000 [10:37:19<9:16:15, 14.29s/it]Train:  53%|█████▎    | 2665/5000 [10:37:34<9:15:50, 14.28s/it]Train:  53%|█████▎    | 2666/5000 [10:37:48<9:15:03, 14.27s/it]Train:  53%|█████▎    | 2667/5000 [10:38:02<9:15:28, 14.29s/it]Train:  53%|█████▎    | 2668/5000 [10:38:16<9:15:14, 14.29s/it]Train:  53%|█████▎    | 2669/5000 [10:38:31<9:15:01, 14.29s/it]Train:  53%|█████▎    | 2670/5000 [10:38:45<9:14:04, 14.27s/it]                                                               {'loss': 1.67794685, 'token_acc': 0.63206807, 'grad_norm': 0.23685615, 'learning_rate': 9.7e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069639, 'epoch': 1.15, 'global_step/max_steps': '2670/5000', 'percentage': '53.40%', 'elapsed_time': '10h 38m 45s', 'remaining_time': '9h 17m 25s'}
+Train:  53%|█████▎    | 2670/5000 [10:38:45<9:14:04, 14.27s/it]Train:  53%|█████▎    | 2670/5000 [10:38:45<9:14:04, 14.27s/it]Train:  53%|█████▎    | 2671/5000 [10:38:59<9:14:12, 14.28s/it]Train:  53%|█████▎    | 2672/5000 [10:39:13<9:13:36, 14.27s/it]Train:  53%|█████▎    | 2673/5000 [10:39:28<9:13:29, 14.27s/it]Train:  53%|█████▎    | 2674/5000 [10:39:42<9:13:28, 14.28s/it]Train:  54%|█████▎    | 2675/5000 [10:39:56<9:13:20, 14.28s/it]Train:  54%|█████▎    | 2676/5000 [10:40:11<9:12:27, 14.26s/it]Train:  54%|█████▎    | 2677/5000 [10:40:25<9:12:08, 14.26s/it]Train:  54%|█████▎    | 2678/5000 [10:40:39<9:12:09, 14.27s/it]Train:  54%|█████▎    | 2679/5000 [10:40:53<9:11:38, 14.26s/it]Train:  54%|█████▎    | 2680/5000 [10:41:08<9:11:46, 14.27s/it]                                                               {'loss': 1.67809868, 'token_acc': 0.63344637, 'grad_norm': 0.23960878, 'learning_rate': 9.64e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069641, 'epoch': 1.15, 'global_step/max_steps': '2680/5000', 'percentage': '53.60%', 'elapsed_time': '10h 41m 8s', 'remaining_time': '9h 15m 0s'}
+Train:  54%|█████▎    | 2680/5000 [10:41:08<9:11:46, 14.27s/it]Train:  54%|█████▎    | 2680/5000 [10:41:08<9:11:46, 14.27s/it]Train:  54%|█████▎    | 2681/5000 [10:41:22<9:11:42, 14.27s/it]Train:  54%|█████▎    | 2682/5000 [10:41:36<9:11:39, 14.28s/it]Train:  54%|█████▎    | 2683/5000 [10:41:51<9:11:53, 14.29s/it]Train:  54%|█████▎    | 2684/5000 [10:42:05<9:11:34, 14.29s/it]Train:  54%|█████▎    | 2685/5000 [10:42:19<9:10:58, 14.28s/it]Train:  54%|█████▎    | 2686/5000 [10:42:33<9:10:55, 14.28s/it]Train:  54%|█████▎    | 2687/5000 [10:42:48<9:11:04, 14.29s/it]Train:  54%|█████▍    | 2688/5000 [10:43:02<9:10:54, 14.30s/it]Train:  54%|█████▍    | 2689/5000 [10:43:16<9:10:00, 14.28s/it]Train:  54%|█████▍    | 2690/5000 [10:43:30<9:09:24, 14.27s/it]                                                               {'loss': 1.67390614, 'token_acc': 0.63612675, 'grad_norm': 0.23954283, 'learning_rate': 9.57e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069642, 'epoch': 1.16, 'global_step/max_steps': '2690/5000', 'percentage': '53.80%', 'elapsed_time': '10h 43m 30s', 'remaining_time': '9h 12m 36s'}
+Train:  54%|█████▍    | 2690/5000 [10:43:30<9:09:24, 14.27s/it]Train:  54%|█████▍    | 2690/5000 [10:43:30<9:09:24, 14.27s/it]Train:  54%|█████▍    | 2691/5000 [10:43:45<9:08:56, 14.26s/it]Train:  54%|█████▍    | 2692/5000 [10:43:59<9:08:39, 14.26s/it]Train:  54%|█████▍    | 2693/5000 [10:44:13<9:07:59, 14.25s/it]Train:  54%|█████▍    | 2694/5000 [10:44:27<9:07:45, 14.25s/it]Train:  54%|█████▍    | 2695/5000 [10:44:42<9:07:47, 14.26s/it]Train:  54%|█████▍    | 2696/5000 [10:44:56<9:07:15, 14.25s/it]Train:  54%|█████▍    | 2697/5000 [10:45:10<9:06:42, 14.24s/it]Train:  54%|█████▍    | 2698/5000 [10:45:24<9:06:34, 14.25s/it]Train:  54%|█████▍    | 2699/5000 [10:45:39<9:06:11, 14.24s/it]Train:  54%|█████▍    | 2700/5000 [10:45:53<9:06:26, 14.25s/it]                                                               {'loss': 1.67704372, 'token_acc': 0.62767595, 'grad_norm': 0.23547032, 'learning_rate': 9.5e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069644, 'epoch': 1.16, 'global_step/max_steps': '2700/5000', 'percentage': '54.00%', 'elapsed_time': '10h 45m 53s', 'remaining_time': '9h 10m 12s'}
+Train:  54%|█████▍    | 2700/5000 [10:45:53<9:06:26, 14.25s/it]Train:  54%|█████▍    | 2700/5000 [10:45:53<9:06:26, 14.25s/it]Train:  54%|█████▍    | 2701/5000 [10:46:07<9:06:05, 14.25s/it]Train:  54%|█████▍    | 2702/5000 [10:46:22<9:06:26, 14.27s/it]Train:  54%|█████▍    | 2703/5000 [10:46:36<9:06:12, 14.27s/it]Train:  54%|█████▍    | 2704/5000 [10:46:50<9:05:43, 14.26s/it]Train:  54%|█████▍    | 2705/5000 [10:47:04<9:05:32, 14.26s/it]Train:  54%|█████▍    | 2706/5000 [10:47:19<9:05:42, 14.27s/it]Train:  54%|█████▍    | 2707/5000 [10:47:33<9:05:30, 14.27s/it]Train:  54%|█████▍    | 2708/5000 [10:47:47<9:05:09, 14.27s/it]Train:  54%|█████▍    | 2709/5000 [10:48:01<9:04:49, 14.27s/it]Train:  54%|█████▍    | 2710/5000 [10:48:16<9:04:34, 14.27s/it]                                                               {'loss': 1.6803484, 'token_acc': 0.63125642, 'grad_norm': 0.23838556, 'learning_rate': 9.44e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069646, 'epoch': 1.16, 'global_step/max_steps': '2710/5000', 'percentage': '54.20%', 'elapsed_time': '10h 48m 16s', 'remaining_time': '9h 7m 47s'}
+Train:  54%|█████▍    | 2710/5000 [10:48:16<9:04:34, 14.27s/it]Train:  54%|█████▍    | 2710/5000 [10:48:16<9:04:34, 14.27s/it]Train:  54%|█████▍    | 2711/5000 [10:48:30<9:04:23, 14.27s/it]Train:  54%|█████▍    | 2712/5000 [10:48:44<9:04:06, 14.27s/it]Train:  54%|█████▍    | 2713/5000 [10:48:58<9:04:00, 14.27s/it]Train:  54%|█████▍    | 2714/5000 [10:49:13<9:03:46, 14.27s/it]Train:  54%|█████▍    | 2715/5000 [10:49:27<9:03:22, 14.27s/it]Train:  54%|█████▍    | 2716/5000 [10:49:41<9:03:11, 14.27s/it]Train:  54%|█████▍    | 2717/5000 [10:49:56<9:04:08, 14.30s/it]Train:  54%|█████▍    | 2718/5000 [10:50:10<9:03:15, 14.28s/it]Train:  54%|█████▍    | 2719/5000 [10:50:24<9:02:50, 14.28s/it]Train:  54%|█████▍    | 2720/5000 [10:50:38<9:02:29, 14.28s/it]                                                               {'loss': 1.66865311, 'token_acc': 0.63012049, 'grad_norm': 0.24236213, 'learning_rate': 9.37e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069647, 'epoch': 1.16, 'global_step/max_steps': '2720/5000', 'percentage': '54.40%', 'elapsed_time': '10h 50m 38s', 'remaining_time': '9h 5m 23s'}
+Train:  54%|█████▍    | 2720/5000 [10:50:38<9:02:29, 14.28s/it]Train:  54%|█████▍    | 2720/5000 [10:50:38<9:02:29, 14.28s/it]Train:  54%|█████▍    | 2721/5000 [10:50:53<9:02:26, 14.28s/it]Train:  54%|█████▍    | 2722/5000 [10:51:07<9:02:17, 14.28s/it]Train:  54%|█████▍    | 2723/5000 [10:51:21<9:02:05, 14.28s/it]Train:  54%|█████▍    | 2724/5000 [10:51:36<9:01:34, 14.28s/it]Train:  55%|█████▍    | 2725/5000 [10:51:50<9:01:10, 14.27s/it]Train:  55%|█████▍    | 2726/5000 [10:52:04<9:00:43, 14.27s/it]Train:  55%|█████▍    | 2727/5000 [10:52:18<9:00:24, 14.27s/it]Train:  55%|█████▍    | 2728/5000 [10:52:33<9:00:45, 14.28s/it]Train:  55%|█████▍    | 2729/5000 [10:52:47<9:00:26, 14.28s/it]Train:  55%|█████▍    | 2730/5000 [10:53:01<9:00:25, 14.28s/it]                                                               {'loss': 1.67257957, 'token_acc': 0.63468372, 'grad_norm': 0.24073161, 'learning_rate': 9.31e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069649, 'epoch': 1.16, 'global_step/max_steps': '2730/5000', 'percentage': '54.60%', 'elapsed_time': '10h 53m 1s', 'remaining_time': '9h 2m 59s'}
+Train:  55%|█████▍    | 2730/5000 [10:53:01<9:00:25, 14.28s/it]Train:  55%|█████▍    | 2730/5000 [10:53:01<9:00:25, 14.28s/it]Train:  55%|█████▍    | 2731/5000 [10:53:16<9:00:21, 14.29s/it]Train:  55%|█████▍    | 2732/5000 [10:53:30<9:00:27, 14.30s/it]Train:  55%|█████▍    | 2733/5000 [10:53:44<8:59:39, 14.28s/it]Train:  55%|█████▍    | 2734/5000 [10:53:58<8:59:28, 14.28s/it]Train:  55%|█████▍    | 2735/5000 [10:54:13<8:58:51, 14.27s/it]Train:  55%|█████▍    | 2736/5000 [10:54:27<8:58:51, 14.28s/it]Train:  55%|█████▍    | 2737/5000 [10:54:41<8:58:33, 14.28s/it]Train:  55%|█████▍    | 2738/5000 [10:54:56<8:58:57, 14.30s/it]Train:  55%|█████▍    | 2739/5000 [10:55:10<8:58:05, 14.28s/it]Train:  55%|█████▍    | 2740/5000 [10:55:24<8:57:55, 14.28s/it]                                                               {'loss': 1.67890682, 'token_acc': 0.6349212, 'grad_norm': 0.23516542, 'learning_rate': 9.24e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06965, 'epoch': 1.17, 'global_step/max_steps': '2740/5000', 'percentage': '54.80%', 'elapsed_time': '10h 55m 24s', 'remaining_time': '9h 0m 35s'}
+Train:  55%|█████▍    | 2740/5000 [10:55:24<8:57:55, 14.28s/it]Train:  55%|██��██▍    | 2740/5000 [10:55:24<8:57:55, 14.28s/it]Train:  55%|█████▍    | 2741/5000 [10:55:38<8:57:30, 14.28s/it]Train:  55%|█████▍    | 2742/5000 [10:55:53<8:57:38, 14.29s/it]Train:  55%|█████▍    | 2743/5000 [10:56:07<8:57:17, 14.28s/it]Train:  55%|█████▍    | 2744/5000 [10:56:21<8:57:02, 14.28s/it]Train:  55%|█████▍    | 2745/5000 [10:56:36<8:57:27, 14.30s/it]Train:  55%|█████▍    | 2746/5000 [10:56:50<8:57:04, 14.30s/it]Train:  55%|█████▍    | 2747/5000 [10:57:04<8:57:15, 14.31s/it]Train:  55%|█████▍    | 2748/5000 [10:57:18<8:56:51, 14.30s/it]Train:  55%|█████▍    | 2749/5000 [10:57:33<8:56:12, 14.29s/it]Train:  55%|█████▌    | 2750/5000 [10:57:47<8:55:54, 14.29s/it]                                                               {'loss': 1.65288277, 'token_acc': 0.63289328, 'grad_norm': 0.23927039, 'learning_rate': 9.17e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069651, 'epoch': 1.17, 'global_step/max_steps': '2750/5000', 'percentage': '55.00%', 'elapsed_time': '10h 57m 47s', 'remaining_time': '8h 58m 11s'}
+Train:  55%|█████▌    | 2750/5000 [10:57:47<8:55:54, 14.29s/it]Train:  55%|█████▌    | 2750/5000 [10:57:47<8:55:54, 14.29s/it]Train:  55%|█████▌    | 2751/5000 [10:58:01<8:55:55, 14.30s/it]Train:  55%|█████▌    | 2752/5000 [10:58:16<8:55:12, 14.28s/it]Train:  55%|█████▌    | 2753/5000 [10:58:30<8:55:06, 14.29s/it]Train:  55%|█████▌    | 2754/5000 [10:58:44<8:55:39, 14.31s/it]Train:  55%|█████▌    | 2755/5000 [10:58:59<8:55:18, 14.31s/it]Train:  55%|█████▌    | 2756/5000 [10:59:13<8:55:25, 14.32s/it]Train:  55%|█████▌    | 2757/5000 [10:59:27<8:55:10, 14.32s/it]Train:  55%|█████▌    | 2758/5000 [10:59:42<8:55:04, 14.32s/it]Train:  55%|█████▌    | 2759/5000 [10:59:56<8:54:34, 14.31s/it]Train:  55%|█████▌    | 2760/5000 [11:00:10<8:53:50, 14.30s/it]                                                               {'loss': 1.67466965, 'token_acc': 0.62952033, 'grad_norm': 0.2474547, 'learning_rate': 9.11e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069652, 'epoch': 1.17, 'global_step/max_steps': '2760/5000', 'percentage': '55.20%', 'elapsed_time': '11h 0m 10s', 'remaining_time': '8h 55m 47s'}
+Train:  55%|█████▌    | 2760/5000 [11:00:10<8:53:50, 14.30s/it]Train:  55%|█████▌    | 2760/5000 [11:00:10<8:53:50, 14.30s/it]Train:  55%|█████▌    | 2761/5000 [11:00:24<8:53:30, 14.30s/it]Train:  55%|█████▌    | 2762/5000 [11:00:39<8:53:15, 14.30s/it]Train:  55%|█████▌    | 2763/5000 [11:00:53<8:53:21, 14.31s/it]Train:  55%|█████▌    | 2764/5000 [11:01:07<8:52:45, 14.30s/it]Train:  55%|█████▌    | 2765/5000 [11:01:22<8:52:24, 14.29s/it]Train:  55%|█████▌    | 2766/5000 [11:01:36<8:52:11, 14.29s/it]Train:  55%|█████▌    | 2767/5000 [11:01:50<8:52:20, 14.30s/it]Train:  55%|█████▌    | 2768/5000 [11:02:04<8:51:55, 14.30s/it]Train:  55%|█████▌    | 2769/5000 [11:02:19<8:51:35, 14.30s/it]Train:  55%|█████▌    | 2770/5000 [11:02:33<8:51:21, 14.30s/it]                                                               {'loss': 1.66240864, 'token_acc': 0.63224948, 'grad_norm': 0.24756268, 'learning_rate': 9.04e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069653, 'epoch': 1.17, 'global_step/max_steps': '2770/5000', 'percentage': '55.40%', 'elapsed_time': '11h 2m 33s', 'remaining_time': '8h 53m 23s'}
+Train:  55%|█████▌    | 2770/5000 [11:02:33<8:51:21, 14.30s/it]Train:  55%|█████▌    | 2770/5000 [11:02:33<8:51:21, 14.30s/it]Train:  55%|█████▌    | 2771/5000 [11:02:47<8:50:52, 14.29s/it]Train:  55%|█████▌    | 2772/5000 [11:03:02<8:50:13, 14.28s/it]Train:  55%|█████▌    | 2773/5000 [11:03:16<8:50:22, 14.29s/it]Train:  55%|█████▌    | 2774/5000 [11:03:30<8:50:03, 14.29s/it]Train:  56%|█████▌    | 2775/5000 [11:03:44<8:50:01, 14.29s/it]Train:  56%|█████▌    | 2776/5000 [11:03:59<8:49:58, 14.30s/it]Train:  56%|█████▌    | 2777/5000 [11:04:13<8:49:59, 14.30s/it]Train:  56%|█████▌    | 2778/5000 [11:04:27<8:49:32, 14.30s/it]Train:  56%|█████▌    | 2779/5000 [11:04:42<8:49:35, 14.31s/it]Train:  56%|█████▌    | 2780/5000 [11:04:56<8:48:59, 14.30s/it]                                                               {'loss': 1.69542065, 'token_acc': 0.630636, 'grad_norm': 0.24558194, 'learning_rate': 8.98e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069654, 'epoch': 1.17, 'global_step/max_steps': '2780/5000', 'percentage': '55.60%', 'elapsed_time': '11h 4m 56s', 'remaining_time': '8h 50m 59s'}
+Train:  56%|█████▌    | 2780/5000 [11:04:56<8:48:59, 14.30s/it]Train:  56%|█████▌    | 2780/5000 [11:04:56<8:48:59, 14.30s/it]Train:  56%|█████▌    | 2781/5000 [11:05:10<8:48:28, 14.29s/it]Train:  56%|█████▌    | 2782/5000 [11:05:25<8:48:06, 14.29s/it]Train:  56%|█████▌    | 2783/5000 [11:05:39<8:48:01, 14.29s/it]Train:  56%|█████▌    | 2784/5000 [11:05:53<8:47:34, 14.28s/it]Train:  56%|█████▌    | 2785/5000 [11:06:07<8:47:19, 14.28s/it]Train:  56%|█████▌    | 2786/5000 [11:06:22<8:46:55, 14.28s/it]Train:  56%|█████▌    | 2787/5000 [11:06:36<8:47:05, 14.29s/it]Train:  56%|█████▌    | 2788/5000 [11:06:50<8:46:33, 14.28s/it]Train:  56%|█████▌    | 2789/5000 [11:07:05<8:46:20, 14.28s/it]Train:  56%|█████▌    | 2790/5000 [11:07:19<8:46:17, 14.29s/it]                                                               {'loss': 1.66648769, 'token_acc': 0.63251959, 'grad_norm': 0.23750542, 'learning_rate': 8.91e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069656, 'epoch': 1.18, 'global_step/max_steps': '2790/5000', 'percentage': '55.80%', 'elapsed_time': '11h 7m 19s', 'remaining_time': '8h 48m 35s'}
+Train:  56%|█████▌    | 2790/5000 [11:07:19<8:46:17, 14.29s/it]Train:  56%|█████▌    | 2790/5000 [11:07:19<8:46:17, 14.29s/it]Train:  56%|█████▌    | 2791/5000 [11:07:33<8:45:59, 14.29s/it]Train:  56%|█████▌    | 2792/5000 [11:07:47<8:45:25, 14.28s/it]Train:  56%|█████▌    | 2793/5000 [11:08:02<8:44:57, 14.27s/it]Train:  56%|█████▌    | 2794/5000 [11:08:16<8:44:37, 14.27s/it]Train:  56%|█████▌    | 2795/5000 [11:08:30<8:46:12, 14.32s/it]Train:  56%|█████▌    | 2796/5000 [11:08:45<8:45:02, 14.29s/it]Train:  56%|█████▌    | 2797/5000 [11:08:59<8:44:27, 14.28s/it]Train:  56%|█████▌    | 2798/5000 [11:09:13<8:44:16, 14.29s/it]Train:  56%|█████▌    | 2799/5000 [11:09:28<8:45:26, 14.32s/it]Train:  56%|█████▌    | 2800/5000 [11:09:42<8:44:51, 14.31s/it]                                                               {'loss': 1.66802635, 'token_acc': 0.63398224, 'grad_norm': 0.23846598, 'learning_rate': 8.85e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069657, 'epoch': 1.18, 'global_step/max_steps': '2800/5000', 'percentage': '56.00%', 'elapsed_time': '11h 9m 42s', 'remaining_time': '8h 46m 11s'}
+Train:  56%|█████▌    | 2800/5000 [11:09:42<8:44:51, 14.31s/it]Train:  56%|█████▌    | 2800/5000 [11:09:42<8:44:51, 14.31s/it]Train:  56%|█████▌    | 2801/5000 [11:09:56<8:44:53, 14.32s/it]Train:  56%|█████▌    | 2802/5000 [11:10:10<8:43:54, 14.30s/it]Train:  56%|█████▌    | 2803/5000 [11:10:25<8:45:42, 14.36s/it]Train:  56%|█████▌    | 2804/5000 [11:10:39<8:44:49, 14.34s/it]Train:  56%|█████▌    | 2805/5000 [11:10:53<8:44:03, 14.33s/it]Train:  56%|█████▌    | 2806/5000 [11:11:08<8:43:06, 14.31s/it]Train:  56%|█████▌    | 2807/5000 [11:11:22<8:42:30, 14.30s/it]Train:  56%|█████▌    | 2808/5000 [11:11:36<8:42:21, 14.30s/it]Train:  56%|█████▌    | 2809/5000 [11:11:51<8:41:55, 14.29s/it]Train:  56%|█████▌    | 2810/5000 [11:12:05<8:41:38, 14.29s/it]                                                               {'loss': 1.67056599, 'token_acc': 0.62898592, 'grad_norm': 0.2434881, 'learning_rate': 8.78e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069657, 'epoch': 1.18, 'global_step/max_steps': '2810/5000', 'percentage': '56.20%', 'elapsed_time': '11h 12m 5s', 'remaining_time': '8h 43m 47s'}
+Train:  56%|█████▌    | 2810/5000 [11:12:05<8:41:38, 14.29s/it]Train:  56%|█████▌    | 2810/5000 [11:12:05<8:41:38, 14.29s/it]Train:  56%|█████▌    | 2811/5000 [11:12:19<8:41:28, 14.29s/it]Train:  56%|█████▌    | 2812/5000 [11:12:33<8:41:09, 14.29s/it]Train:  56%|█████▋    | 2813/5000 [11:12:48<8:40:34, 14.28s/it]Train:  56%|█████▋    | 2814/5000 [11:13:02<8:40:11, 14.28s/it]Train:  56%|█████▋    | 2815/5000 [11:13:16<8:40:15, 14.29s/it]Train:  56%|█████▋    | 2816/5000 [11:13:31<8:39:57, 14.28s/it]Train:  56%|█████▋    | 2817/5000 [11:13:45<8:39:47, 14.29s/it]Train:  56%|█████▋    | 2818/5000 [11:13:59<8:39:05, 14.27s/it]Train:  56%|█████▋    | 2819/5000 [11:14:13<8:39:12, 14.28s/it]Train:  56%|█████▋    | 2820/5000 [11:14:28<8:39:34, 14.30s/it]                                                               {'loss': 1.67972221, 'token_acc': 0.62908491, 'grad_norm': 0.23583564, 'learning_rate': 8.71e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069659, 'epoch': 1.18, 'global_step/max_steps': '2820/5000', 'percentage': '56.40%', 'elapsed_time': '11h 14m 28s', 'remaining_time': '8h 41m 23s'}
+Train:  56%|█████▋    | 2820/5000 [11:14:28<8:39:34, 14.30s/it]Train:  56%|█████▋    | 2820/5000 [11:14:28<8:39:34, 14.30s/it]Train:  56%|█████▋    | 2821/5000 [11:14:42<8:39:21, 14.30s/it]Train:  56%|█████▋    | 2822/5000 [11:14:56<8:38:49, 14.29s/it]Train:  56%|█████▋    | 2823/5000 [11:15:11<8:38:30, 14.29s/it]Train:  56%|█████▋    | 2824/5000 [11:15:25<8:37:54, 14.28s/it]Train:  56%|█████▋    | 2825/5000 [11:15:39<8:38:03, 14.29s/it]Train:  57%|█████▋    | 2826/5000 [11:15:54<8:38:27, 14.31s/it]Train:  57%|█████▋    | 2827/5000 [11:16:08<8:37:49, 14.30s/it]Train:  57%|█████▋    | 2828/5000 [11:16:22<8:37:14, 14.29s/it]Train:  57%|█████▋    | 2829/5000 [11:16:36<8:37:20, 14.30s/it]Train:  57%|█████▋    | 2830/5000 [11:16:51<8:36:49, 14.29s/it]                                                               {'loss': 1.67147579, 'token_acc': 0.63727893, 'grad_norm': 0.24144328, 'learning_rate': 8.65e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06966, 'epoch': 1.18, 'global_step/max_steps': '2830/5000', 'percentage': '56.60%', 'elapsed_time': '11h 16m 51s', 'remaining_time': '8h 39m 0s'}
+Train:  57%|█████▋    | 2830/5000 [11:16:51<8:36:49, 14.29s/it]Train:  57%|█████▋    | 2830/5000 [11:16:51<8:36:49, 14.29s/it]Train:  57%|█████▋    | 2831/5000 [11:17:05<8:36:34, 14.29s/it]Train:  57%|█████▋    | 2832/5000 [11:17:19<8:36:01, 14.28s/it]Train:  57%|█████▋    | 2833/5000 [11:17:34<8:35:53, 14.28s/it]Train:  57%|█████▋    | 2834/5000 [11:17:48<8:35:45, 14.29s/it]Train:  57%|█████▋    | 2835/5000 [11:18:02<8:35:27, 14.29s/it]Train:  57%|█████▋    | 2836/5000 [11:18:16<8:35:11, 14.28s/it]Train:  57%|█████▋    | 2837/5000 [11:18:31<8:34:32, 14.27s/it]Train:  57%|█████▋    | 2838/5000 [11:18:45<8:34:41, 14.28s/it]Train:  57%|█████▋    | 2839/5000 [11:18:59<8:34:16, 14.28s/it]Train:  57%|█████▋    | 2840/5000 [11:19:14<8:34:13, 14.28s/it]                                                               {'loss': 1.66916656, 'token_acc': 0.63667501, 'grad_norm': 0.23799285, 'learning_rate': 8.58e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069661, 'epoch': 1.19, 'global_step/max_steps': '2840/5000', 'percentage': '56.80%', 'elapsed_time': '11h 19m 14s', 'remaining_time': '8h 36m 36s'}
+Train:  57%|█████▋    | 2840/5000 [11:19:14<8:34:13, 14.28s/it]Train:  57%|█████▋    | 2840/5000 [11:19:14<8:34:13, 14.28s/it]Train:  57%|█████▋    | 2841/5000 [11:19:28<8:34:17, 14.29s/it]Train:  57%|█████▋    | 2842/5000 [11:19:42<8:34:00, 14.29s/it]Train:  57%|█████▋    | 2843/5000 [11:19:56<8:33:45, 14.29s/it]Train:  57%|█████▋    | 2844/5000 [11:20:11<8:33:53, 14.30s/it]Train:  57%|█████▋    | 2845/5000 [11:20:25<8:33:12, 14.29s/it]Train:  57%|█████▋    | 2846/5000 [11:20:39<8:33:22, 14.30s/it]Train:  57%|█████▋    | 2847/5000 [11:20:54<8:33:36, 14.31s/it]Train:  57%|█████▋    | 2848/5000 [11:21:08<8:33:04, 14.30s/it]Train:  57%|█████▋    | 2849/5000 [11:21:22<8:32:44, 14.30s/it]Train:  57%|█████▋    | 2850/5000 [11:21:37<8:32:28, 14.30s/it]                                                               {'loss': 1.66229935, 'token_acc': 0.63122656, 'grad_norm': 0.24291697, 'learning_rate': 8.52e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069662, 'epoch': 1.19, 'global_step/max_steps': '2850/5000', 'percentage': '57.00%', 'elapsed_time': '11h 21m 37s', 'remaining_time': '8h 34m 12s'}
+Train:  57%|█████▋    | 2850/5000 [11:21:37<8:32:28, 14.30s/it]Train:  57%|█████▋    | 2850/5000 [11:21:37<8:32:28, 14.30s/it]Train:  57%|█████▋    | 2851/5000 [11:21:51<8:32:02, 14.30s/it]Train:  57%|█████▋    | 2852/5000 [11:22:05<8:31:58, 14.30s/it]Train:  57%|█████▋    | 2853/5000 [11:22:19<8:32:15, 14.32s/it]Train:  57%|█████▋    | 2854/5000 [11:22:34<8:31:33, 14.30s/it]Train:  57%|█████▋    | 2855/5000 [11:22:48<8:30:47, 14.29s/it]Train:  57%|█████▋    | 2856/5000 [11:23:02<8:30:21, 14.28s/it]Train:  57%|█████▋    | 2857/5000 [11:23:17<8:30:40, 14.30s/it]Train:  57%|█████▋    | 2858/5000 [11:23:31<8:30:30, 14.30s/it]Train:  57%|█████▋    | 2859/5000 [11:23:45<8:30:14, 14.30s/it]Train:  57%|█████▋    | 2860/5000 [11:23:59<8:29:50, 14.29s/it]                                                               {'loss': 1.65905457, 'token_acc': 0.6305013, 'grad_norm': 0.23509276, 'learning_rate': 8.45e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069663, 'epoch': 1.19, 'global_step/max_steps': '2860/5000', 'percentage': '57.20%', 'elapsed_time': '11h 23m 59s', 'remaining_time': '8h 31m 48s'}
+Train:  57%|█████▋    | 2860/5000 [11:23:59<8:29:50, 14.29s/it]Train:  57%|█████▋    | 2860/5000 [11:23:59<8:29:50, 14.29s/it]Train:  57%|█████▋    | 2861/5000 [11:24:14<8:29:09, 14.28s/it]Train:  57%|█████▋    | 2862/5000 [11:24:28<8:28:33, 14.27s/it]Train:  57%|█████▋    | 2863/5000 [11:24:42<8:28:04, 14.27s/it]Train:  57%|█████▋    | 2864/5000 [11:24:57<8:28:19, 14.28s/it]Train:  57%|█████▋    | 2865/5000 [11:25:11<8:28:06, 14.28s/it]Train:  57%|█████▋    | 2866/5000 [11:25:25<8:27:28, 14.27s/it]Train:  57%|█████▋    | 2867/5000 [11:25:39<8:27:21, 14.27s/it]Train:  57%|█████▋    | 2868/5000 [11:25:54<8:27:26, 14.28s/it]Train:  57%|█████▋    | 2869/5000 [11:26:08<8:27:17, 14.28s/it]Train:  57%|█████▋    | 2870/5000 [11:26:22<8:27:27, 14.29s/it]                                                               {'loss': 1.66471786, 'token_acc': 0.63528425, 'grad_norm': 0.23770818, 'learning_rate': 8.39e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069664, 'epoch': 1.19, 'global_step/max_steps': '2870/5000', 'percentage': '57.40%', 'elapsed_time': '11h 26m 22s', 'remaining_time': '8h 29m 24s'}
+Train:  57%|█████▋    | 2870/5000 [11:26:22<8:27:27, 14.29s/it]Train:  57%|█████▋    | 2870/5000 [11:26:22<8:27:27, 14.29s/it]Train:  57%|█████▋    | 2871/5000 [11:26:37<8:26:46, 14.28s/it]Train:  57%|█████▋    | 2872/5000 [11:26:51<8:26:25, 14.28s/it]Train:  57%|█████▋    | 2873/5000 [11:27:05<8:26:26, 14.29s/it]Train:  57%|█████▋    | 2874/5000 [11:27:19<8:25:53, 14.28s/it]Train:  57%|█████▊    | 2875/5000 [11:27:34<8:25:55, 14.28s/it]Train:  58%|█████▊    | 2876/5000 [11:27:48<8:25:10, 14.27s/it]Train:  58%|█████▊    | 2877/5000 [11:28:02<8:24:58, 14.27s/it]Train:  58%|█████▊    | 2878/5000 [11:28:16<8:24:30, 14.27s/it]Train:  58%|█████▊    | 2879/5000 [11:28:31<8:24:27, 14.27s/it]Train:  58%|█████▊    | 2880/5000 [11:28:45<8:24:16, 14.27s/it]                                                               {'loss': 1.67566223, 'token_acc': 0.63039713, 'grad_norm': 0.23172919, 'learning_rate': 8.32e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069665, 'epoch': 1.19, 'global_step/max_steps': '2880/5000', 'percentage': '57.60%', 'elapsed_time': '11h 28m 45s', 'remaining_time': '8h 27m 0s'}
+Train:  58%|█████▊    | 2880/5000 [11:28:45<8:24:16, 14.27s/it]Train:  58%|█████▊    | 2880/5000 [11:28:45<8:24:16, 14.27s/it]Train:  58%|█████▊    | 2881/5000 [11:28:59<8:23:45, 14.26s/it]Train:  58%|█████▊    | 2882/5000 [11:29:13<8:23:36, 14.27s/it]Train:  58%|█████▊    | 2883/5000 [11:29:28<8:23:29, 14.27s/it]Train:  58%|█████▊    | 2884/5000 [11:29:42<8:23:23, 14.27s/it]Train:  58%|█████▊    | 2885/5000 [11:29:56<8:23:34, 14.29s/it]Train:  58%|█████▊    | 2886/5000 [11:30:11<8:23:16, 14.28s/it]Train:  58%|█████▊    | 2887/5000 [11:30:25<8:22:52, 14.28s/it]Train:  58%|█████▊    | 2888/5000 [11:30:39<8:22:39, 14.28s/it]Train:  58%|█████▊    | 2889/5000 [11:30:54<8:23:03, 14.30s/it]Train:  58%|█████▊    | 2890/5000 [11:31:08<8:23:05, 14.31s/it]                                                               {'loss': 1.67859917, 'token_acc': 0.63604011, 'grad_norm': 0.23881349, 'learning_rate': 8.26e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069667, 'epoch': 1.2, 'global_step/max_steps': '2890/5000', 'percentage': '57.80%', 'elapsed_time': '11h 31m 8s', 'remaining_time': '8h 24m 36s'}
+Train:  58%|█████▊    | 2890/5000 [11:31:08<8:23:05, 14.31s/it]Train:  58%|█████▊    | 2890/5000 [11:31:08<8:23:05, 14.31s/it]Train:  58%|█████▊    | 2891/5000 [11:31:22<8:22:58, 14.31s/it]Train:  58%|█████▊    | 2892/5000 [11:31:36<8:22:48, 14.31s/it]Train:  58%|█████▊    | 2893/5000 [11:31:51<8:22:23, 14.31s/it]Train:  58%|█████▊    | 2894/5000 [11:32:05<8:22:15, 14.31s/it]Train:  58%|█████▊    | 2895/5000 [11:32:19<8:21:51, 14.30s/it]Train:  58%|█████▊    | 2896/5000 [11:32:34<8:21:01, 14.29s/it]Train:  58%|█████▊    | 2897/5000 [11:32:48<8:20:50, 14.29s/it]Train:  58%|█████▊    | 2898/5000 [11:33:02<8:20:25, 14.28s/it]Train:  58%|█████▊    | 2899/5000 [11:33:17<8:20:35, 14.30s/it]Train:  58%|█████▊    | 2900/5000 [11:33:31<8:19:44, 14.28s/it]                                                               {'loss': 1.67206039, 'token_acc': 0.63126369, 'grad_norm': 0.23657645, 'learning_rate': 8.19e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069668, 'epoch': 1.2, 'global_step/max_steps': '2900/5000', 'percentage': '58.00%', 'elapsed_time': '11h 33m 31s', 'remaining_time': '8h 22m 12s'}
+Train:  58%|█████▊    | 2900/5000 [11:33:31<8:19:44, 14.28s/it]Train:  58%|█████▊    | 2900/5000 [11:33:31<8:19:44, 14.28s/it]Train:  58%|█████▊    | 2901/5000 [11:33:45<8:19:11, 14.27s/it]Train:  58%|���████▊    | 2902/5000 [11:33:59<8:19:35, 14.29s/it]Train:  58%|█████▊    | 2903/5000 [11:34:14<8:19:18, 14.29s/it]Train:  58%|█████▊    | 2904/5000 [11:34:28<8:19:30, 14.30s/it]Train:  58%|█████▊    | 2905/5000 [11:34:42<8:18:39, 14.28s/it]Train:  58%|█████▊    | 2906/5000 [11:34:56<8:18:23, 14.28s/it]Train:  58%|█████▊    | 2907/5000 [11:35:11<8:18:40, 14.30s/it]Train:  58%|█████▊    | 2908/5000 [11:35:25<8:18:06, 14.29s/it]Train:  58%|█████▊    | 2909/5000 [11:35:39<8:17:40, 14.28s/it]Train:  58%|█████▊    | 2910/5000 [11:35:54<8:17:27, 14.28s/it]                                                               {'loss': 1.66177635, 'token_acc': 0.6344662, 'grad_norm': 0.23036115, 'learning_rate': 8.13e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069669, 'epoch': 1.2, 'global_step/max_steps': '2910/5000', 'percentage': '58.20%', 'elapsed_time': '11h 35m 54s', 'remaining_time': '8h 19m 48s'}
+Train:  58%|█████▊    | 2910/5000 [11:35:54<8:17:27, 14.28s/it]Train:  58%|█████▊    | 2910/5000 [11:35:54<8:17:27, 14.28s/it]Train:  58%|█████▊    | 2911/5000 [11:36:08<8:16:55, 14.27s/it]Train:  58%|█████▊    | 2912/5000 [11:36:22<8:16:58, 14.28s/it]Train:  58%|█████▊    | 2913/5000 [11:36:36<8:16:26, 14.27s/it]Train:  58%|█████▊    | 2914/5000 [11:36:51<8:16:35, 14.28s/it]Train:  58%|█████▊    | 2915/5000 [11:37:05<8:16:14, 14.28s/it]Train:  58%|█████▊    | 2916/5000 [11:37:19<8:16:18, 14.29s/it]Train:  58%|█████▊    | 2917/5000 [11:37:34<8:15:56, 14.29s/it]Train:  58%|█████▊    | 2918/5000 [11:37:48<8:15:44, 14.29s/it]Train:  58%|█████▊    | 2919/5000 [11:38:02<8:15:13, 14.28s/it]Train:  58%|█████▊    | 2920/5000 [11:38:16<8:15:28, 14.29s/it]                                                               {'loss': 1.65175838, 'token_acc': 0.64000414, 'grad_norm': 0.23774607, 'learning_rate': 8.06e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06967, 'epoch': 1.2, 'global_step/max_steps': '2920/5000', 'percentage': '58.40%', 'elapsed_time': '11h 38m 16s', 'remaining_time': '8h 17m 24s'}
+Train:  58%|█████▊    | 2920/5000 [11:38:16<8:15:28, 14.29s/it]Train:  58%|█████▊    | 2920/5000 [11:38:16<8:15:28, 14.29s/it]Train:  58%|█████▊    | 2921/5000 [11:38:31<8:15:13, 14.29s/it]Train:  58%|█████▊    | 2922/5000 [11:38:45<8:15:02, 14.29s/it]Train:  58%|█████▊    | 2923/5000 [11:38:59<8:14:50, 14.29s/it]Train:  58%|█████▊    | 2924/5000 [11:39:14<8:14:32, 14.29s/it]Train:  58%|█████▊    | 2925/5000 [11:39:28<8:14:18, 14.29s/it]Train:  59%|█████▊    | 2926/5000 [11:39:42<8:13:49, 14.29s/it]Train:  59%|█████▊    | 2927/5000 [11:39:57<8:13:51, 14.29s/it]Train:  59%|█████▊    | 2928/5000 [11:40:11<8:13:25, 14.29s/it]Train:  59%|█████▊    | 2929/5000 [11:40:25<8:12:56, 14.28s/it]Train:  59%|█████▊    | 2930/5000 [11:40:39<8:12:46, 14.28s/it]                                                               {'loss': 1.66648483, 'token_acc': 0.63862706, 'grad_norm': 0.23354287, 'learning_rate': 8e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069671, 'epoch': 1.2, 'global_step/max_steps': '2930/5000', 'percentage': '58.60%', 'elapsed_time': '11h 40m 39s', 'remaining_time': '8h 15m 0s'}
+Train:  59%|█████▊    | 2930/5000 [11:40:39<8:12:46, 14.28s/it]Train:  59%|█████▊    | 2930/5000 [11:40:39<8:12:46, 14.28s/it]Train:  59%|█████▊    | 2931/5000 [11:40:54<8:12:25, 14.28s/it]Train:  59%|█████▊    | 2932/5000 [11:41:08<8:12:14, 14.28s/it]Train:  59%|█████▊    | 2933/5000 [11:41:22<8:11:17, 14.26s/it]Train:  59%|█████▊    | 2934/5000 [11:41:36<8:10:50, 14.25s/it]Train:  59%|█████▊    | 2935/5000 [11:41:51<8:10:56, 14.26s/it]Train:  59%|█████▊    | 2936/5000 [11:42:05<8:10:35, 14.26s/it]Train:  59%|█████▊    | 2937/5000 [11:42:19<8:10:51, 14.28s/it]Train:  59%|█████▉    | 2938/5000 [11:42:33<8:10:36, 14.28s/it]Train:  59%|█████▉    | 2939/5000 [11:42:48<8:10:53, 14.29s/it]Train:  59%|█████▉    | 2940/5000 [11:43:02<8:10:04, 14.27s/it]                                                               {'loss': 1.66140995, 'token_acc': 0.63581006, 'grad_norm': 0.25146729, 'learning_rate': 7.93e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069672, 'epoch': 1.21, 'global_step/max_steps': '2940/5000', 'percentage': '58.80%', 'elapsed_time': '11h 43m 2s', 'remaining_time': '8h 12m 36s'}
+Train:  59%|█████▉    | 2940/5000 [11:43:02<8:10:04, 14.27s/it]Train:  59%|█████▉    | 2940/5000 [11:43:02<8:10:04, 14.27s/it]Train:  59%|█████▉    | 2941/5000 [11:43:16<8:09:56, 14.28s/it]Train:  59%|█████▉    | 2942/5000 [11:43:31<8:10:00, 14.29s/it]Train:  59%|█████▉    | 2943/5000 [11:43:45<8:10:08, 14.30s/it]Train:  59%|█████▉    | 2944/5000 [11:43:59<8:09:26, 14.28s/it]Train:  59%|█████▉    | 2945/5000 [11:44:14<8:09:10, 14.28s/it]Train:  59%|█████▉    | 2946/5000 [11:44:28<8:08:33, 14.27s/it]Train:  59%|█████▉    | 2947/5000 [11:44:42<8:08:43, 14.28s/it]Train:  59%|█████▉    | 2948/5000 [11:44:56<8:08:25, 14.28s/it]Train:  59%|█████▉    | 2949/5000 [11:45:11<8:07:50, 14.27s/it]Train:  59%|█████▉    | 2950/5000 [11:45:25<8:08:03, 14.28s/it]                                                               {'loss': 1.66982727, 'token_acc': 0.63418251, 'grad_norm': 0.23735021, 'learning_rate': 7.87e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069674, 'epoch': 1.21, 'global_step/max_steps': '2950/5000', 'percentage': '59.00%', 'elapsed_time': '11h 45m 25s', 'remaining_time': '8h 10m 12s'}
+Train:  59%|█████▉    | 2950/5000 [11:45:25<8:08:03, 14.28s/it]Train:  59%|█████▉    | 2950/5000 [11:45:25<8:08:03, 14.28s/it]Train:  59%|█████▉    | 2951/5000 [11:45:39<8:07:25, 14.27s/it]Train:  59%|█████▉    | 2952/5000 [11:45:53<8:06:59, 14.27s/it]Train:  59%|█████▉    | 2953/5000 [11:46:08<8:07:08, 14.28s/it]Train:  59%|█████▉    | 2954/5000 [11:46:22<8:07:15, 14.29s/it]Train:  59%|█████▉    | 2955/5000 [11:46:36<8:07:11, 14.29s/it]Train:  59%|█████▉    | 2956/5000 [11:46:51<8:06:20, 14.28s/it]Train:  59%|█████▉    | 2957/5000 [11:47:05<8:05:52, 14.27s/it]Train:  59%|█████▉    | 2958/5000 [11:47:19<8:05:21, 14.26s/it]Train:  59%|█████▉    | 2959/5000 [11:47:33<8:05:03, 14.26s/it]Train:  59%|█████▉    | 2960/5000 [11:47:48<8:05:07, 14.27s/it]                                                               {'loss': 1.67158604, 'token_acc': 0.63237847, 'grad_norm': 1.13633704, 'learning_rate': 7.8e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069675, 'epoch': 1.21, 'global_step/max_steps': '2960/5000', 'percentage': '59.20%', 'elapsed_time': '11h 47m 48s', 'remaining_time': '8h 7m 48s'}
+Train:  59%|█████▉    | 2960/5000 [11:47:48<8:05:07, 14.27s/it]Train:  59%|█████▉    | 2960/5000 [11:47:48<8:05:07, 14.27s/it]Train:  59%|█████▉    | 2961/5000 [11:48:02<8:05:18, 14.28s/it]Train:  59%|█████▉    | 2962/5000 [11:48:16<8:04:54, 14.28s/it]Train:  59%|█████▉    | 2963/5000 [11:48:31<8:05:13, 14.29s/it]Train:  59%|█████▉    | 2964/5000 [11:48:45<8:04:45, 14.29s/it]Train:  59%|█████▉    | 2965/5000 [11:48:59<8:04:15, 14.28s/it]Train:  59%|█████▉    | 2966/5000 [11:49:13<8:04:19, 14.29s/it]Train:  59%|█████▉    | 2967/5000 [11:49:28<8:03:48, 14.28s/it]Train:  59%|█████▉    | 2968/5000 [11:49:42<8:03:44, 14.28s/it]Train:  59%|█████▉    | 2969/5000 [11:49:56<8:03:21, 14.28s/it]Train:  59%|█████▉    | 2970/5000 [11:50:10<8:02:51, 14.27s/it]                                                               {'loss': 1.669557, 'token_acc': 0.62904193, 'grad_norm': 0.23894365, 'learning_rate': 7.74e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069676, 'epoch': 1.21, 'global_step/max_steps': '2970/5000', 'percentage': '59.40%', 'elapsed_time': '11h 50m 10s', 'remaining_time': '8h 5m 24s'}
+Train:  59%|█████▉    | 2970/5000 [11:50:10<8:02:51, 14.27s/it]Train:  59%|█████▉    | 2970/5000 [11:50:10<8:02:51, 14.27s/it]Train:  59%|█████▉    | 2971/5000 [11:50:25<8:02:29, 14.27s/it]Train:  59%|█████▉    | 2972/5000 [11:50:39<8:02:35, 14.28s/it]Train:  59%|█████▉    | 2973/5000 [11:50:53<8:02:10, 14.27s/it]Train:  59%|█████▉    | 2974/5000 [11:51:08<8:02:53, 14.30s/it]Train:  60%|█████▉    | 2975/5000 [11:51:22<8:02:21, 14.29s/it]Train:  60%|█████▉    | 2976/5000 [11:51:36<8:02:22, 14.30s/it]Train:  60%|█████▉    | 2977/5000 [11:51:51<8:02:36, 14.31s/it]Train:  60%|█████▉    | 2978/5000 [11:52:05<8:01:45, 14.30s/it]Train:  60%|█████▉    | 2979/5000 [11:52:19<8:01:34, 14.30s/it]Train:  60%|█████▉    | 2980/5000 [11:52:33<8:01:12, 14.29s/it]                                                               {'loss': 1.66899223, 'token_acc': 0.63762426, 'grad_norm': 0.25034222, 'learning_rate': 7.67e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069677, 'epoch': 1.21, 'global_step/max_steps': '2980/5000', 'percentage': '59.60%', 'elapsed_time': '11h 52m 33s', 'remaining_time': '8h 3m 0s'}
+Train:  60%|█████▉    | 2980/5000 [11:52:33<8:01:12, 14.29s/it]Train:  60%|█████▉    | 2980/5000 [11:52:33<8:01:12, 14.29s/it]Train:  60%|█████▉    | 2981/5000 [11:52:48<8:00:33, 14.28s/it]Train:  60%|█████▉    | 2982/5000 [11:53:02<8:00:28, 14.29s/it]Train:  60%|█████▉    | 2983/5000 [11:53:16<7:59:39, 14.27s/it]Train:  60%|█████▉    | 2984/5000 [11:53:30<7:59:43, 14.28s/it]Train:  60%|█████▉    | 2985/5000 [11:53:45<7:59:30, 14.28s/it]Train:  60%|█████▉    | 2986/5000 [11:53:59<7:59:36, 14.29s/it]Train:  60%|█████▉    | 2987/5000 [11:54:13<7:59:27, 14.29s/it]Train:  60%|█████▉    | 2988/5000 [11:54:28<7:59:00, 14.28s/it]Train:  60%|█████▉    | 2989/5000 [11:54:42<7:59:06, 14.29s/it]Train:  60%|█████▉    | 2990/5000 [11:54:56<7:58:34, 14.29s/it]                                                               {'loss': 1.65002861, 'token_acc': 0.63502922, 'grad_norm': 0.23804264, 'learning_rate': 7.61e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069678, 'epoch': 1.22, 'global_step/max_steps': '2990/5000', 'percentage': '59.80%', 'elapsed_time': '11h 54m 56s', 'remaining_time': '8h 0m 36s'}
+Train:  60%|█████▉    | 2990/5000 [11:54:56<7:58:34, 14.29s/it]Train:  60%|█████▉    | 2990/5000 [11:54:56<7:58:34, 14.29s/it]Train:  60%|█████▉    | 2991/5000 [11:55:11<7:58:33, 14.29s/it]Train:  60%|█████▉    | 2992/5000 [11:55:25<7:58:03, 14.28s/it]Train:  60%|█████▉    | 2993/5000 [11:55:39<7:57:39, 14.28s/it]Train:  60%|█████▉    | 2994/5000 [11:55:53<7:57:37, 14.29s/it]Train:  60%|█████▉    | 2995/5000 [11:56:08<7:57:14, 14.28s/it]Train:  60%|█████▉    | 2996/5000 [11:56:22<7:56:52, 14.28s/it]Train:  60%|█████▉    | 2997/5000 [11:56:36<7:56:31, 14.27s/it]Train:  60%|█████▉    | 2998/5000 [11:56:50<7:56:24, 14.28s/it]Train:  60%|█████▉    | 2999/5000 [11:57:05<7:55:57, 14.27s/it]Train:  60%|██████    | 3000/5000 [11:57:19<7:55:31, 14.27s/it]                                                               {'loss': 1.6705368, 'token_acc': 0.62977341, 'grad_norm': 0.2340755, 'learning_rate': 7.55e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069679, 'epoch': 1.22, 'global_step/max_steps': '3000/5000', 'percentage': '60.00%', 'elapsed_time': '11h 57m 19s', 'remaining_time': '7h 58m 12s'}
+Train:  60%|██████    | 3000/5000 [11:57:19<7:55:31, 14.27s/it]Train:  60%|██████    | 3000/5000 [11:57:19<7:55:31, 14.27s/it]                                                               {'eval_loss': 1.45585132, 'eval_token_acc': 0.66392005, 'eval_runtime': 42.3669, 'eval_samples_per_second': 0.33, 'eval_steps_per_second': 0.024, 'epoch': 1.22, 'global_step/max_steps': '3000/5000', 'percentage': '60.00%', 'elapsed_time': '11h 58m 1s', 'remaining_time': '7h 58m 41s'}
+Train:  60%|██████    | 3000/5000 [11:58:01<7:55:31, 14.27s/it]Train:  60%|██████    | 3000/5000 [11:58:01<7:55:31, 14.27s/it][INFO:swift] Saving model checkpoint to /mnt/nvme1/luoyingfeng/llm-mt/exps/Qwen3-4B-Base/cpt_10lang_mono/0.5B/checkpoint-3000
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/storage.py:414: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  return torch.load(io.BytesIO(b))
+/mnt/nvme1/luoyingfeng/h200_ms/lib/python3.10/site-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+Train:  60%|██████    | 3001/5000 [11:59:02<22:44:23, 40.95s/it]Train:  60%|██████    | 3002/5000 [11:59:16<18:16:52, 32.94s/it]Train:  60%|██████    | 3003/5000 [11:59:31<15:10:25, 27.35s/it]Train:  60%|██████    | 3004/5000 [11:59:45<12:59:38, 23.44s/it]Train:  60%|██████    | 3005/5000 [11:59:59<11:28:28, 20.71s/it]Train:  60%|██████    | 3006/5000 [12:00:14<10:24:36, 18.79s/it]Train:  60%|██████    | 3007/5000 [12:00:28<9:39:41, 17.45s/it] Train:  60%|██████    | 3008/5000 [12:00:42<9:08:28, 16.52s/it]Train:  60%|██████    | 3009/5000 [12:00:57<8:45:59, 15.85s/it]Train:  60%|██████    | 3010/5000 [12:01:11<8:30:18, 15.39s/it]                                                               {'loss': 1.66333122, 'token_acc': 0.63081747, 'grad_norm': 0.23705311, 'learning_rate': 7.48e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069537, 'epoch': 1.22, 'global_step/max_steps': '3010/5000', 'percentage': '60.20%', 'elapsed_time': '12h 1m 11s', 'remaining_time': '7h 56m 48s'}
+Train:  60%|██████    | 3010/5000 [12:01:11<8:30:18, 15.39s/it]Train:  60%|██████    | 3010/5000 [12:01:11<8:30:18, 15.39s/it]Train:  60%|██████    | 3011/5000 [12:01:25<8:20:05, 15.09s/it]Train:  60%|██████    | 3012/5000 [12:01:40<8:11:53, 14.85s/it]Train:  60%|██████    | 3013/5000 [12:01:54<8:06:51, 14.70s/it]Train:  60%|██████    | 3014/5000 [12:02:08<8:02:44, 14.58s/it]Train:  60%|██████    | 3015/5000 [12:02:23<7:59:31, 14.49s/it]Train:  60%|██████    | 3016/5000 [12:02:37<7:57:30, 14.44s/it]Train:  60%|██████    | 3017/5000 [12:02:51<7:56:13, 14.41s/it]Train:  60%|██████    | 3018/5000 [12:03:06<7:55:34, 14.40s/it]Train:  60%|██████    | 3019/5000 [12:03:20<7:54:22, 14.37s/it]Train:  60%|██████    | 3020/5000 [12:03:34<7:53:31, 14.35s/it]                                                               {'loss': 1.66398773, 'token_acc': 0.63559524, 'grad_norm': 0.24290243, 'learning_rate': 7.42e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069538, 'epoch': 1.22, 'global_step/max_steps': '3020/5000', 'percentage': '60.40%', 'elapsed_time': '12h 3m 34s', 'remaining_time': '7h 54m 23s'}
+Train:  60%|██████    | 3020/5000 [12:03:34<7:53:31, 14.35s/it]Train:  60%|██████    | 3020/5000 [12:03:34<7:53:31, 14.35s/it]Train:  60%|██████    | 3021/5000 [12:03:49<7:53:01, 14.34s/it]Train:  60%|██████    | 3022/5000 [12:04:03<7:52:15, 14.33s/it]Train:  60%|██████    | 3023/5000 [12:04:17<7:51:47, 14.32s/it]Train:  60%|██████    | 3024/5000 [12:04:31<7:51:25, 14.31s/it]Train:  60%|██████    | 3025/5000 [12:04:46<7:50:48, 14.30s/it]Train:  61%|██████    | 3026/5000 [12:05:00<7:50:31, 14.30s/it]Train:  61%|██████    | 3027/5000 [12:05:14<7:50:26, 14.31s/it]Train:  61%|██████    | 3028/5000 [12:05:29<7:49:52, 14.30s/it]Train:  61%|██████    | 3029/5000 [12:05:43<7:49:44, 14.30s/it]Train:  61%|██████    | 3030/5000 [12:05:57<7:49:31, 14.30s/it]                                                               {'loss': 1.66387138, 'token_acc': 0.63112725, 'grad_norm': 0.23166157, 'learning_rate': 7.35e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069539, 'epoch': 1.22, 'global_step/max_steps': '3030/5000', 'percentage': '60.60%', 'elapsed_time': '12h 5m 57s', 'remaining_time': '7h 51m 59s'}
+Train:  61%|██████    | 3030/5000 [12:05:57<7:49:31, 14.30s/it]Train:  61%|██████    | 3030/5000 [12:05:57<7:49:31, 14.30s/it]Train:  61%|██████    | 3031/5000 [12:06:12<7:49:31, 14.31s/it]Train:  61%|██████    | 3032/5000 [12:06:26<7:49:11, 14.30s/it]Train:  61%|██████    | 3033/5000 [12:06:40<7:48:38, 14.30s/it]Train:  61%|██████    | 3034/5000 [12:06:54<7:48:46, 14.31s/it]Train:  61%|██████    | 3035/5000 [12:07:09<7:48:27, 14.30s/it]Train:  61%|██████    | 3036/5000 [12:07:23<7:48:00, 14.30s/it]Train:  61%|██████    | 3037/5000 [12:07:37<7:47:53, 14.30s/it]Train:  61%|██████    | 3038/5000 [12:07:52<7:47:20, 14.29s/it]Train:  61%|██████    | 3039/5000 [12:08:06<7:47:17, 14.30s/it]Train:  61%|██████    | 3040/5000 [12:08:20<7:46:47, 14.29s/it]                                                               {'loss': 1.66447525, 'token_acc': 0.63289821, 'grad_norm': 0.24270087, 'learning_rate': 7.29e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06954, 'epoch': 1.23, 'global_step/max_steps': '3040/5000', 'percentage': '60.80%', 'elapsed_time': '12h 8m 20s', 'remaining_time': '7h 49m 35s'}
+Train:  61%|██████    | 3040/5000 [12:08:20<7:46:47, 14.29s/it]Train:  61%|██████    | 3040/5000 [12:08:20<7:46:47, 14.29s/it]Train:  61%|██████    | 3041/5000 [12:08:34<7:46:40, 14.29s/it]Train:  61%|██████    | 3042/5000 [12:08:49<7:46:36, 14.30s/it]Train:  61%|██████    | 3043/5000 [12:09:03<7:46:09, 14.29s/it]Train:  61%|██████    | 3044/5000 [12:09:17<7:45:50, 14.29s/it]Train:  61%|██████    | 3045/5000 [12:09:32<7:45:50, 14.30s/it]Train:  61%|██████    | 3046/5000 [12:09:46<7:45:27, 14.29s/it]Train:  61%|██████    | 3047/5000 [12:10:00<7:44:48, 14.28s/it]Train:  61%|██████    | 3048/5000 [12:10:14<7:44:14, 14.27s/it]Train:  61%|██████    | 3049/5000 [12:10:29<7:44:03, 14.27s/it]Train:  61%|██████    | 3050/5000 [12:10:43<7:43:43, 14.27s/it]                                                               {'loss': 1.65983696, 'token_acc': 0.63211879, 'grad_norm': 0.24143377, 'learning_rate': 7.23e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069542, 'epoch': 1.23, 'global_step/max_steps': '3050/5000', 'percentage': '61.00%', 'elapsed_time': '12h 10m 43s', 'remaining_time': '7h 47m 11s'}
+Train:  61%|██████    | 3050/5000 [12:10:43<7:43:43, 14.27s/it]Train:  61%|██████    | 3050/5000 [12:10:43<7:43:43, 14.27s/it]Train:  61%|██████    | 3051/5000 [12:10:57<7:43:29, 14.27s/it]Train:  61%|██████    | 3052/5000 [12:11:12<7:43:19, 14.27s/it]Train:  61%|██████    | 3053/5000 [12:11:26<7:42:50, 14.26s/it]Train:  61%|██████    | 3054/5000 [12:11:40<7:42:39, 14.26s/it]Train:  61%|██████    | 3055/5000 [12:11:54<7:42:11, 14.26s/it]Train:  61%|██████    | 3056/5000 [12:12:08<7:41:35, 14.25s/it]Train:  61%|██████    | 3057/5000 [12:12:23<7:41:36, 14.25s/it]Train:  61%|██████    | 3058/5000 [12:12:37<7:41:34, 14.26s/it]Train:  61%|██████    | 3059/5000 [12:12:51<7:40:50, 14.25s/it]Train:  61%|██████    | 3060/5000 [12:13:06<7:40:42, 14.25s/it]                                                               {'loss': 1.6791172, 'token_acc': 0.62946642, 'grad_norm': 0.24141945, 'learning_rate': 7.16e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069544, 'epoch': 1.23, 'global_step/max_steps': '3060/5000', 'percentage': '61.20%', 'elapsed_time': '12h 13m 6s', 'remaining_time': '7h 44m 46s'}
+Train:  61%|██████    | 3060/5000 [12:13:06<7:40:42, 14.25s/it]Train:  61%|██████    | 3060/5000 [12:13:06<7:40:42, 14.25s/it]Train:  61%|██████    | 3061/5000 [12:13:20<7:40:50, 14.26s/it]Train:  61%|██████    | 3062/5000 [12:13:34<7:40:57, 14.27s/it]Train:  61%|██████▏   | 3063/5000 [12:13:48<7:40:36, 14.27s/it]Train:  61%|██████▏   | 3064/5000 [12:14:03<7:40:19, 14.27s/it]Train:  61%|██████▏   | 3065/5000 [12:14:17<7:40:30, 14.28s/it]Train:  61%|██████▏   | 3066/5000 [12:14:31<7:40:08, 14.28s/it]Train:  61%|██████▏   | 3067/5000 [12:14:45<7:39:43, 14.27s/it]Train:  61%|██████▏   | 3068/5000 [12:15:00<7:39:42, 14.28s/it]Train:  61%|██████▏   | 3069/5000 [12:15:14<7:39:32, 14.28s/it]Train:  61%|██████▏   | 3070/5000 [12:15:28<7:39:35, 14.29s/it]                                                               {'loss': 1.66830826, 'token_acc': 0.63378337, 'grad_norm': 0.23648332, 'learning_rate': 7.1e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069545, 'epoch': 1.23, 'global_step/max_steps': '3070/5000', 'percentage': '61.40%', 'elapsed_time': '12h 15m 28s', 'remaining_time': '7h 42m 22s'}
+Train:  61%|██████▏   | 3070/5000 [12:15:28<7:39:35, 14.29s/it]Train:  61%|██████▏   | 3070/5000 [12:15:28<7:39:35, 14.29s/it]Train:  61%|██████▏   | 3071/5000 [12:15:43<7:39:10, 14.28s/it]Train:  61%|██████▏   | 3072/5000 [12:15:57<7:38:23, 14.27s/it]Train:  61%|██████▏   | 3073/5000 [12:16:11<7:38:11, 14.27s/it]Train:  61%|██████▏   | 3074/5000 [12:16:25<7:37:36, 14.26s/it]Train:  62%|██████▏   | 3075/5000 [12:16:40<7:37:29, 14.26s/it]Train:  62%|██████▏   | 3076/5000 [12:16:54<7:37:50, 14.28s/it]Train:  62%|██████▏   | 3077/5000 [12:17:08<7:37:13, 14.27s/it]Train:  62%|██████▏   | 3078/5000 [12:17:22<7:36:32, 14.25s/it]Train:  62%|██████▏   | 3079/5000 [12:17:37<7:36:02, 14.24s/it]Train:  62%|██████▏   | 3080/5000 [12:17:51<7:36:03, 14.25s/it]                                                               {'loss': 1.67522964, 'token_acc': 0.63427847, 'grad_norm': 0.23262018, 'learning_rate': 7.04e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069547, 'epoch': 1.23, 'global_step/max_steps': '3080/5000', 'percentage': '61.60%', 'elapsed_time': '12h 17m 51s', 'remaining_time': '7h 39m 57s'}
+Train:  62%|██████▏   | 3080/5000 [12:17:51<7:36:03, 14.25s/it]Train:  62%|██████▏   | 3080/5000 [12:17:51<7:36:03, 14.25s/it]Train:  62%|██████▏   | 3081/5000 [12:18:05<7:36:31, 14.27s/it]Train:  62%|██████▏   | 3082/5000 [12:18:19<7:36:15, 14.27s/it]Train:  62%|██████▏   | 3083/5000 [12:18:34<7:36:08, 14.28s/it]Train:  62%|██████▏   | 3084/5000 [12:18:48<7:35:41, 14.27s/it]Train:  62%|██████▏   | 3085/5000 [12:19:02<7:35:39, 14.28s/it]Train:  62%|██████▏   | 3086/5000 [12:19:17<7:35:07, 14.27s/it]Train:  62%|██████▏   | 3087/5000 [12:19:31<7:34:43, 14.26s/it]Train:  62%|██████▏   | 3088/5000 [12:19:45<7:34:51, 14.27s/it]Train:  62%|██████▏   | 3089/5000 [12:19:59<7:34:44, 14.28s/it]Train:  62%|██████▏   | 3090/5000 [12:20:14<7:34:17, 14.27s/it]                                                               {'loss': 1.66489906, 'token_acc': 0.6356825, 'grad_norm': 0.23703387, 'learning_rate': 6.97e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069549, 'epoch': 1.24, 'global_step/max_steps': '3090/5000', 'percentage': '61.80%', 'elapsed_time': '12h 20m 14s', 'remaining_time': '7h 37m 33s'}
+Train:  62%|██████▏   | 3090/5000 [12:20:14<7:34:17, 14.27s/it]Train:  62%|██████▏   | 3090/5000 [12:20:14<7:34:17, 14.27s/it]Train:  62%|██████▏   | 3091/5000 [12:20:28<7:33:49, 14.26s/it]Train:  62%|██████▏   | 3092/5000 [12:20:42<7:33:29, 14.26s/it]Train:  62%|██████▏   | 3093/5000 [12:20:56<7:33:16, 14.26s/it]Train:  62%|██████▏   | 3094/5000 [12:21:11<7:32:58, 14.26s/it]Train:  62%|██████▏   | 3095/5000 [12:21:25<7:32:21, 14.25s/it]Train:  62%|██████▏   | 3096/5000 [12:21:39<7:32:25, 14.26s/it]Train:  62%|██████▏   | 3097/5000 [12:21:53<7:32:08, 14.26s/it]Train:  62%|██████▏   | 3098/5000 [12:22:08<7:31:57, 14.26s/it]Train:  62%|██████▏   | 3099/5000 [12:22:22<7:32:01, 14.27s/it]Train:  62%|██████▏   | 3100/5000 [12:22:36<7:32:15, 14.28s/it]                                                               {'loss': 1.66921654, 'token_acc': 0.63663311, 'grad_norm': 0.23444308, 'learning_rate': 6.91e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069551, 'epoch': 1.24, 'global_step/max_steps': '3100/5000', 'percentage': '62.00%', 'elapsed_time': '12h 22m 36s', 'remaining_time': '7h 35m 8s'}
+Train:  62%|██████▏   | 3100/5000 [12:22:36<7:32:15, 14.28s/it]Train:  62%|██████▏   | 3100/5000 [12:22:36<7:32:15, 14.28s/it]Train:  62%|██████▏   | 3101/5000 [12:22:51<7:31:52, 14.28s/it]Train:  62%|██████▏   | 3102/5000 [12:23:05<7:31:30, 14.27s/it]Train:  62%|██████▏   | 3103/5000 [12:23:19<7:31:15, 14.27s/it]Train:  62%|██████▏   | 3104/5000 [12:23:33<7:31:11, 14.28s/it]Train:  62%|██████▏   | 3105/5000 [12:23:48<7:31:03, 14.28s/it]Train:  62%|██████▏   | 3106/5000 [12:24:02<7:30:47, 14.28s/it]Train:  62%|██████▏   | 3107/5000 [12:24:16<7:30:55, 14.29s/it]Train:  62%|██████▏   | 3108/5000 [12:24:31<7:30:29, 14.29s/it]Train:  62%|██████▏   | 3109/5000 [12:24:45<7:29:45, 14.27s/it]Train:  62%|██████▏   | 3110/5000 [12:24:59<7:29:28, 14.27s/it]                                                               {'loss': 1.66758995, 'token_acc': 0.63631855, 'grad_norm': 0.23497057, 'learning_rate': 6.85e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069552, 'epoch': 1.24, 'global_step/max_steps': '3110/5000', 'percentage': '62.20%', 'elapsed_time': '12h 24m 59s', 'remaining_time': '7h 32m 44s'}
+Train:  62%|██████▏   | 3110/5000 [12:24:59<7:29:28, 14.27s/it]Train:  62%|██████▏   | 3110/5000 [12:24:59<7:29:28, 14.27s/it]Train:  62%|██████▏   | 3111/5000 [12:25:13<7:29:13, 14.27s/it]Train:  62%|██████▏   | 3112/5000 [12:25:28<7:28:43, 14.26s/it]Train:  62%|██████▏   | 3113/5000 [12:25:42<7:28:14, 14.25s/it]Train:  62%|██████▏   | 3114/5000 [12:25:56<7:28:15, 14.26s/it]Train:  62%|██████▏   | 3115/5000 [12:26:10<7:28:00, 14.26s/it]Train:  62%|██████▏   | 3116/5000 [12:26:25<7:27:50, 14.26s/it]Train:  62%|██████▏   | 3117/5000 [12:26:39<7:27:58, 14.27s/it]Train:  62%|██████▏   | 3118/5000 [12:26:53<7:27:44, 14.27s/it]Train:  62%|██████▏   | 3119/5000 [12:27:07<7:27:25, 14.27s/it]Train:  62%|██████▏   | 3120/5000 [12:27:22<7:27:25, 14.28s/it]                                                               {'loss': 1.66058521, 'token_acc': 0.6380211, 'grad_norm': 0.2380119, 'learning_rate': 6.78e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069554, 'epoch': 1.24, 'global_step/max_steps': '3120/5000', 'percentage': '62.40%', 'elapsed_time': '12h 27m 22s', 'remaining_time': '7h 30m 20s'}
+Train:  62%|██████▏   | 3120/5000 [12:27:22<7:27:25, 14.28s/it]Train:  62%|██████▏   | 3120/5000 [12:27:22<7:27:25, 14.28s/it]Train:  62%|██████▏   | 3121/5000 [12:27:36<7:27:15, 14.28s/it]Train:  62%|██████▏   | 3122/5000 [12:27:50<7:27:04, 14.28s/it]Train:  62%|██████▏   | 3123/5000 [12:28:05<7:27:07, 14.29s/it]Train:  62%|██████▏   | 3124/5000 [12:28:19<7:27:08, 14.30s/it]Train:  62%|██████▎   | 3125/5000 [12:28:33<7:26:32, 14.29s/it]Train:  63%|██████▎   | 3126/5000 [12:28:47<7:26:03, 14.28s/it]Train:  63%|██████▎   | 3127/5000 [12:29:02<7:25:53, 14.28s/it]Train:  63%|██████▎   | 3128/5000 [12:29:16<7:25:29, 14.28s/it]Train:  63%|██████▎   | 3129/5000 [12:29:30<7:25:26, 14.28s/it]Train:  63%|██████▎   | 3130/5000 [12:29:45<7:25:17, 14.29s/it]                                                               {'loss': 1.67164764, 'token_acc': 0.6292085, 'grad_norm': 0.2400225, 'learning_rate': 6.72e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069556, 'epoch': 1.24, 'global_step/max_steps': '3130/5000', 'percentage': '62.60%', 'elapsed_time': '12h 29m 45s', 'remaining_time': '7h 27m 56s'}
+Train:  63%|██████▎   | 3130/5000 [12:29:45<7:25:17, 14.29s/it]Train:  63%|██████▎   | 3130/5000 [12:29:45<7:25:17, 14.29s/it]Train:  63%|██████▎   | 3131/5000 [12:29:59<7:25:00, 14.29s/it]Train:  63%|██████▎   | 3132/5000 [12:30:13<7:24:52, 14.29s/it]Train:  63%|██████▎   | 3133/5000 [12:30:27<7:24:33, 14.29s/it]Train:  63%|██████▎   | 3134/5000 [12:30:42<7:24:03, 14.28s/it]Train:  63%|██████▎   | 3135/5000 [12:30:56<7:23:54, 14.28s/it]Train:  63%|██████▎   | 3136/5000 [12:31:10<7:23:43, 14.28s/it]Train:  63%|██████▎   | 3137/5000 [12:31:25<7:23:01, 14.27s/it]Train:  63%|██████▎   | 3138/5000 [12:31:39<7:23:11, 14.28s/it]Train:  63%|██████▎   | 3139/5000 [12:31:53<7:23:13, 14.29s/it]Train:  63%|██████▎   | 3140/5000 [12:32:07<7:23:02, 14.29s/it]                                                               {'loss': 1.66556797, 'token_acc': 0.63109133, 'grad_norm': 0.22725484, 'learning_rate': 6.66e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069557, 'epoch': 1.25, 'global_step/max_steps': '3140/5000', 'percentage': '62.80%', 'elapsed_time': '12h 32m 7s', 'remaining_time': '7h 25m 31s'}
+Train:  63%|██████▎   | 3140/5000 [12:32:07<7:23:02, 14.29s/it]Train:  63%|██████▎   | 3140/5000 [12:32:07<7:23:02, 14.29s/it]Train:  63%|██████▎   | 3141/5000 [12:32:22<7:22:29, 14.28s/it]Train:  63%|██████▎   | 3142/5000 [12:32:36<7:22:47, 14.30s/it]Train:  63%|██████▎   | 3143/5000 [12:32:50<7:22:37, 14.30s/it]Train:  63%|██████▎   | 3144/5000 [12:33:05<7:22:39, 14.31s/it]Train:  63%|██████▎   | 3145/5000 [12:33:19<7:22:12, 14.30s/it]Train:  63%|██████▎   | 3146/5000 [12:33:33<7:21:54, 14.30s/it]Train:  63%|██████▎   | 3147/5000 [12:33:48<7:21:30, 14.30s/it]Train:  63%|██████▎   | 3148/5000 [12:34:02<7:21:06, 14.29s/it]Train:  63%|██████▎   | 3149/5000 [12:34:16<7:20:12, 14.27s/it]Train:  63%|██████▎   | 3150/5000 [12:34:30<7:20:02, 14.27s/it]                                                               {'loss': 1.65887756, 'token_acc': 0.63419742, 'grad_norm': 0.23302642, 'learning_rate': 6.6e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069558, 'epoch': 1.25, 'global_step/max_steps': '3150/5000', 'percentage': '63.00%', 'elapsed_time': '12h 34m 30s', 'remaining_time': '7h 23m 7s'}
+Train:  63%|██████▎   | 3150/5000 [12:34:30<7:20:02, 14.27s/it]Train:  63%|██████▎   | 3150/5000 [12:34:30<7:20:02, 14.27s/it]Train:  63%|██████▎   | 3151/5000 [12:34:45<7:20:27, 14.29s/it]Train:  63%|██████▎   | 3152/5000 [12:34:59<7:20:22, 14.30s/it]Train:  63%|██████▎   | 3153/5000 [12:35:13<7:19:53, 14.29s/it]Train:  63%|██████▎   | 3154/5000 [12:35:28<7:19:34, 14.29s/it]Train:  63%|██████▎   | 3155/5000 [12:35:42<7:18:56, 14.27s/it]Train:  63%|██████▎   | 3156/5000 [12:35:56<7:18:51, 14.28s/it]Train:  63%|██████▎   | 3157/5000 [12:36:10<7:18:59, 14.29s/it]Train:  63%|██████▎   | 3158/5000 [12:36:25<7:18:14, 14.28s/it]Train:  63%|██████▎   | 3159/5000 [12:36:39<7:17:47, 14.27s/it]Train:  63%|██████▎   | 3160/5000 [12:36:53<7:17:33, 14.27s/it]                                                               {'loss': 1.66568413, 'token_acc': 0.63401858, 'grad_norm': 0.23587565, 'learning_rate': 6.53e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06956, 'epoch': 1.25, 'global_step/max_steps': '3160/5000', 'percentage': '63.20%', 'elapsed_time': '12h 36m 53s', 'remaining_time': '7h 20m 43s'}
+Train:  63%|██████▎   | 3160/5000 [12:36:53<7:17:33, 14.27s/it]Train:  63%|██████▎   | 3160/5000 [12:36:53<7:17:33, 14.27s/it]Train:  63%|██████▎   | 3161/5000 [12:37:07<7:17:23, 14.27s/it]Train:  63%|██████▎   | 3162/5000 [12:37:22<7:17:20, 14.28s/it]Train:  63%|██████▎   | 3163/5000 [12:37:36<7:16:45, 14.27s/it]Train:  63%|██████▎   | 3164/5000 [12:37:50<7:16:19, 14.26s/it]Train:  63%|██████▎   | 3165/5000 [12:38:05<7:16:39, 14.28s/it]Train:  63%|██████▎   | 3166/5000 [12:38:19<7:16:23, 14.28s/it]Train:  63%|██████▎   | 3167/5000 [12:38:33<7:16:17, 14.28s/it]Train:  63%|██████▎   | 3168/5000 [12:38:47<7:15:52, 14.28s/it]Train:  63%|██████▎   | 3169/5000 [12:39:02<7:15:44, 14.28s/it]Train:  63%|██████▎   | 3170/5000 [12:39:16<7:15:46, 14.29s/it]                                                               {'loss': 1.67072487, 'token_acc': 0.62955188, 'grad_norm': 0.2453187, 'learning_rate': 6.47e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069561, 'epoch': 1.25, 'global_step/max_steps': '3170/5000', 'percentage': '63.40%', 'elapsed_time': '12h 39m 16s', 'remaining_time': '7h 18m 19s'}
+Train:  63%|██████▎   | 3170/5000 [12:39:16<7:15:46, 14.29s/it]Train:  63%|██████▎   | 3170/5000 [12:39:16<7:15:46, 14.29s/it]Train:  63%|██████▎   | 3171/5000 [12:39:30<7:15:08, 14.27s/it]Train:  63%|██████▎   | 3172/5000 [12:39:44<7:14:28, 14.26s/it]Train:  63%|██████▎   | 3173/5000 [12:39:59<7:14:16, 14.26s/it]Train:  63%|██████▎   | 3174/5000 [12:40:13<7:14:03, 14.26s/it]Train:  64%|██████▎   | 3175/5000 [12:40:27<7:13:56, 14.27s/it]Train:  64%|██████▎   | 3176/5000 [12:40:41<7:13:44, 14.27s/it]Train:  64%|██████▎   | 3177/5000 [12:40:56<7:13:25, 14.26s/it]Train:  64%|██████▎   | 3178/5000 [12:41:10<7:13:25, 14.27s/it]Train:  64%|██████▎   | 3179/5000 [12:41:24<7:12:52, 14.26s/it]Train:  64%|██████▎   | 3180/5000 [12:41:39<7:12:41, 14.26s/it]                                                               {'loss': 1.66901665, 'token_acc': 0.6261399, 'grad_norm': 0.23751093, 'learning_rate': 6.41e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069563, 'epoch': 1.25, 'global_step/max_steps': '3180/5000', 'percentage': '63.60%', 'elapsed_time': '12h 41m 39s', 'remaining_time': '7h 15m 54s'}
+Train:  64%|██████▎   | 3180/5000 [12:41:39<7:12:41, 14.26s/it]Train:  64%|██████▎   | 3180/5000 [12:41:39<7:12:41, 14.26s/it]Train:  64%|██████▎   | 3181/5000 [12:41:53<7:12:25, 14.26s/it]Train:  64%|██████▎   | 3182/5000 [12:42:07<7:12:12, 14.26s/it]Train:  64%|██████▎   | 3183/5000 [12:42:21<7:11:39, 14.25s/it]Train:  64%|██████▎   | 3184/5000 [12:42:36<7:11:29, 14.26s/it]Train:  64%|██████▎   | 3185/5000 [12:42:50<7:11:22, 14.26s/it]Train:  64%|██████▎   | 3186/5000 [12:43:04<7:11:11, 14.26s/it]Train:  64%|██████▎   | 3187/5000 [12:43:18<7:11:04, 14.27s/it]Train:  64%|██████▍   | 3188/5000 [12:43:33<7:10:43, 14.26s/it]Train:  64%|██████▍   | 3189/5000 [12:43:47<7:10:23, 14.26s/it]Train:  64%|██████▍   | 3190/5000 [12:44:01<7:10:13, 14.26s/it]                                                               {'loss': 1.6643343, 'token_acc': 0.63970275, 'grad_norm': 0.24223742, 'learning_rate': 6.35e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069565, 'epoch': 1.26, 'global_step/max_steps': '3190/5000', 'percentage': '63.80%', 'elapsed_time': '12h 44m 1s', 'remaining_time': '7h 13m 30s'}
+Train:  64%|██████▍   | 3190/5000 [12:44:01<7:10:13, 14.26s/it]Train:  64%|██████▍   | 3190/5000 [12:44:01<7:10:13, 14.26s/it]Train:  64%|██████▍   | 3191/5000 [12:44:15<7:10:03, 14.26s/it]Train:  64%|██████▍   | 3192/5000 [12:44:30<7:09:53, 14.27s/it]Train:  64%|██████▍   | 3193/5000 [12:44:44<7:09:54, 14.27s/it]Train:  64%|██████▍   | 3194/5000 [12:44:58<7:09:31, 14.27s/it]Train:  64%|██████▍   | 3195/5000 [12:45:13<7:09:16, 14.27s/it]Train:  64%|██████▍   | 3196/5000 [12:45:27<7:08:59, 14.27s/it]Train:  64%|██████▍   | 3197/5000 [12:45:41<7:08:52, 14.27s/it]Train:  64%|██████▍   | 3198/5000 [12:45:55<7:08:35, 14.27s/it]Train:  64%|██████▍   | 3199/5000 [12:46:10<7:08:08, 14.26s/it]Train:  64%|██████▍   | 3200/5000 [12:46:24<7:07:50, 14.26s/it]                                                               {'loss': 1.66419334, 'token_acc': 0.63431484, 'grad_norm': 0.24099705, 'learning_rate': 6.29e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069566, 'epoch': 1.26, 'global_step/max_steps': '3200/5000', 'percentage': '64.00%', 'elapsed_time': '12h 46m 24s', 'remaining_time': '7h 11m 6s'}
+Train:  64%|██████▍   | 3200/5000 [12:46:24<7:07:50, 14.26s/it]Train:  64%|██████▍   | 3200/5000 [12:46:24<7:07:50, 14.26s/it]Train:  64%|██████▍   | 3201/5000 [12:46:38<7:07:45, 14.27s/it]Train:  64%|██████▍   | 3202/5000 [12:46:52<7:07:42, 14.27s/it]Train:  64%|██████▍   | 3203/5000 [12:47:07<7:07:21, 14.27s/it]Train:  64%|██████▍   | 3204/5000 [12:47:21<7:06:41, 14.25s/it]Train:  64%|██████▍   | 3205/5000 [12:47:35<7:06:39, 14.26s/it]Train:  64%|██████▍   | 3206/5000 [12:47:49<7:06:15, 14.26s/it]Train:  64%|██████▍   | 3207/5000 [12:48:04<7:05:54, 14.25s/it]Train:  64%|██████▍   | 3208/5000 [12:48:18<7:05:34, 14.25s/it]Train:  64%|██████▍   | 3209/5000 [12:48:32<7:05:46, 14.26s/it]Train:  64%|██████▍   | 3210/5000 [12:48:46<7:05:40, 14.27s/it]                                                               {'loss': 1.67386627, 'token_acc': 0.63275724, 'grad_norm': 0.23764347, 'learning_rate': 6.23e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069568, 'epoch': 1.26, 'global_step/max_steps': '3210/5000', 'percentage': '64.20%', 'elapsed_time': '12h 48m 46s', 'remaining_time': '7h 8m 41s'}
+Train:  64%|██████▍   | 3210/5000 [12:48:46<7:05:40, 14.27s/it]Train:  64%|██████▍   | 3210/5000 [12:48:46<7:05:40, 14.27s/it]Train:  64%|██████▍   | 3211/5000 [12:49:01<7:05:58, 14.29s/it]Train:  64%|██████▍   | 3212/5000 [12:49:15<7:05:43, 14.29s/it]Train:  64%|██████▍   | 3213/5000 [12:49:29<7:04:58, 14.27s/it]Train:  64%|██████▍   | 3214/5000 [12:49:44<7:04:56, 14.28s/it]Train:  64%|██████▍   | 3215/5000 [12:49:58<7:04:42, 14.28s/it]Train:  64%|██████▍   | 3216/5000 [12:50:12<7:04:03, 14.26s/it]Train:  64%|██████▍   | 3217/5000 [12:50:26<7:03:53, 14.26s/it]Train:  64%|██████▍   | 3218/5000 [12:50:41<7:03:37, 14.26s/it]Train:  64%|██████▍   | 3219/5000 [12:50:55<7:03:25, 14.26s/it]Train:  64%|██████▍   | 3220/5000 [12:51:09<7:02:52, 14.25s/it]                                                               {'loss': 1.66957169, 'token_acc': 0.62959655, 'grad_norm': 0.236853, 'learning_rate': 6.17e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.06957, 'epoch': 1.26, 'global_step/max_steps': '3220/5000', 'percentage': '64.40%', 'elapsed_time': '12h 51m 9s', 'remaining_time': '7h 6m 17s'}
+Train:  64%|██████▍   | 3220/5000 [12:51:09<7:02:52, 14.25s/it]Train:  64%|██████▍   | 3220/5000 [12:51:09<7:02:52, 14.25s/it]Train:  64%|██████▍   | 3221/5000 [12:51:23<7:03:08, 14.27s/it]Train:  64%|██████▍   | 3222/5000 [12:51:38<7:03:14, 14.28s/it]Train:  64%|██████▍   | 3223/5000 [12:51:52<7:02:47, 14.28s/it]Train:  64%|██████▍   | 3224/5000 [12:52:06<7:02:21, 14.27s/it]Train:  64%|██████▍   | 3225/5000 [12:52:21<7:02:09, 14.27s/it]Train:  65%|██████▍   | 3226/5000 [12:52:35<7:01:45, 14.26s/it]Train:  65%|██████▍   | 3227/5000 [12:52:49<7:01:21, 14.26s/it]Train:  65%|██████▍   | 3228/5000 [12:53:03<7:01:18, 14.27s/it]Train:  65%|██████▍   | 3229/5000 [12:53:18<7:00:53, 14.26s/it]Train:  65%|██████▍   | 3230/5000 [12:53:32<7:01:08, 14.28s/it]                                                               {'loss': 1.66130333, 'token_acc': 0.63282659, 'grad_norm': 0.2378826, 'learning_rate': 6.1e-06, 'memory(GiB)': 129.56, 'train_speed(iter/s)': 0.069571, 'epoch': 1.26, 'global_step/max_steps': '3230/5000', 'percentage': '64.60%', 'elapsed_time': '12h 53m 32s', 'remaining_time': '7h 3m 53s'}
+Train:  65%|██████▍   | 3230/5000 [12:53:32<7:01:08, 14.28s/it]Train:  65%|██████▍   | 3230/5000 [12:53:32<7:01:08, 14.28s/it]Train:  65%|██████▍   | 3231/5000 [12:53:46<7:00:38, 14.27s/it]Train:  65%|██████▍   | 3232/5000 [12:54:00<7:00:36, 14.27s/it]Train:  65%|██████▍   | 3233/5000 [12:54:15<7:00:32, 14.28s/it]Train:  65%|██████▍   | 3234/5000 [12:54:29<7:00:17, 14.28s/it]
\ No newline at end of file