hongzhuyi committed on
Commit aa77bde · verified · 1 Parent(s): 5589c57

Upload folder using huggingface_hub

log/20250917-13:20:02.log ADDED
The diff for this file is too large to render. See raw diff.
 
log/20250917-13:25:53.log ADDED
@@ -0,0 +1,676 @@
+ run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+
+ *****************************************
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ *****************************************
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:10,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:11,277] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:26:11,772] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:11,781] [INFO] [comm.py:821:init_distributed] cdb=None
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:12,631] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:12,640] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:26:12,992] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:14,391] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:14,400] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:26:14,793] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] Setting args.lazy_tokenize: False
+ [2025-09-17 13:26:16,173] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:16,182] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+ [2025-09-17 13:26:16,449] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:17,770] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:17,779] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:26:17,779] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ [2025-09-17 13:26:18,258] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:19,673] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:19,683] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:26:20,035] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:21,373] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:21,381] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:26:21,413] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:26:22,788] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:26:22,801] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625
+ [INFO:swift] Global seed set to 42
+ [INFO:swift] args: TrainArguments(
+ _n_gpu=-1,
+ acc_strategy=token,
+ accelerator_config={'dispatch_batches': False},
+ adafactor=False,
+ adalora_beta1=0.85,
+ adalora_beta2=0.85,
+ adalora_deltaT=1,
+ adalora_init_r=12,
+ adalora_orth_reg_weight=0.5,
+ adalora_target_r=8,
+ adalora_tfinal=0,
+ adalora_tinit=0,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_epsilon=1e-08,
+ adapter_act=gelu,
+ adapter_length=128,
+ adapters=[],
+ add_version=True,
+ agent_template=None,
+ aligner_lr=None,
+ attn_impl=None,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=True,
+ batch_eval_metrics=False,
+ bf16=True,
+ bf16_full_eval=False,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_quant_storage=None,
+ bnb_4bit_quant_type=nf4,
+ bnb_4bit_use_double_quant=True,
+ boft_block_num=0,
+ boft_block_size=4,
+ boft_dropout=0.0,
+ boft_n_butterfly_factor=1,
+ cached_dataset=[],
+ channels=None,
+ check_model=True,
+ ckpt_dir=None,
+ columns={},
+ create_checkpoint_symlink=False,
+ custom_dataset_info=[],
+ custom_register_path=[],
+ data_seed=42,
+ dataloader_drop_last=False,
+ dataloader_num_workers=48,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
+ dataset_num_proc=100,
+ dataset_shuffle=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=18000000,
+ debug=None,
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
+ deepspeed_autotp_size=None,
+ device_map=None,
+ disable_tqdm=None,
+ do_eval=False,
+ do_predict=False,
+ do_train=False,
+ download_mode=reuse_dataset_if_exists,
+ ds3_gather_for_generation=True,
+ early_stop_interval=None,
+ enable_dft_loss=False,
+ eval_accumulation_steps=None,
+ eval_dataset=[],
+ eval_dataset_args=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_generation_config=None,
+ eval_limit=None,
+ eval_on_start=False,
+ eval_steps=2000.0,
+ eval_strategy=epoch,
+ eval_use_evalscope=False,
+ eval_use_gather_object=False,
+ external_plugins=[],
+ extra_eval_args=None,
+ fourier_n_frequency=2000,
+ fourier_scaling=300.0,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ freeze_aligner=False,
+ freeze_llm=False,
+ freeze_parameters=[],
+ freeze_parameters_ratio=0.0,
+ freeze_parameters_regex=None,
+ freeze_vit=True,
+ fsdp=,
+ fsdp_config=None,
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ galore_cos_threshold=0.4,
+ galore_gamma_proj=2,
+ galore_optim_per_parameter=False,
+ galore_proj_bits=4,
+ galore_proj_group_size=256,
+ galore_proj_quant=False,
+ galore_proj_type=std,
+ galore_quantization=False,
+ galore_queue_size=5,
+ galore_rank=128,
+ galore_scale=1.0,
+ galore_target_modules=None,
+ galore_update_proj_gap=50,
+ galore_with_embedding=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=4,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=False,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hqq_axis=None,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=None,
+ hub_revision=None,
+ hub_strategy=every_save,
+ hub_token=<HUB_TOKEN>,
+ ignore_args_error=False,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ init_strategy=None,
+ init_weights=True,
+ interleave_prob=None,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ lazy_tokenize=False,
+ learning_rate=5e-06,
+ length_column_name=length,
+ liger_kernel_config=None,
+ lisa_activated_layers=0,
+ lisa_step_interval=20,
+ llamapro_num_groups=None,
+ llamapro_num_new_blocks=4,
+ load_args=False,
+ load_best_model_at_end=False,
+ load_data_args=False,
+ load_from_cache_file=True,
+ local_rank=0,
+ local_repo_path=None,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625/runs,
+ logging_first_step=True,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=steps,
+ logprobs=False,
+ lora_alpha=32,
+ lora_bias=none,
+ lora_dropout=0.05,
+ lora_dtype=None,
+ lora_ga_batch_size=2,
+ lora_ga_direction=ArB2r,
+ lora_ga_iters=2,
+ lora_ga_max_length=1024,
+ lora_ga_scale=stable,
+ lora_ga_stable_gamma=16,
+ lora_modules=[],
+ lora_rank=8,
+ lorap_lr_ratio=None,
+ loss_scale=default,
+ loss_type=None,
+ lr_scheduler_kwargs=None,
+ lr_scheduler_type=cosine,
+ max_epochs=None,
+ max_grad_norm=1.0,
+ max_length=16240,
+ max_memory={},
+ max_model_len=None,
+ max_new_tokens=64,
+ max_pixels=None,
+ max_steps=-1,
+ metric=None,
+ metric_for_best_model=loss,
+ model=Qwen/Qwen2.5-7B-Instruct,
+ model_author=None,
+ model_kwargs={},
+ model_name=None,
+ model_revision=None,
+ model_type=qwen2_5,
+ modules_to_save=[],
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ new_special_tokens=[],
+ no_cuda=False,
+ norm_bbox=None,
+ num_beams=1,
+ num_labels=None,
+ num_train_epochs=2.0,
+ optim=adamw_torch_fused,
+ optim_args=None,
+ optim_target_modules=None,
+ optimizer=None,
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625,
+ overwrite_output_dir=False,
+ packing=False,
+ packing_length=None,
+ padding_free=False,
+ padding_side=right,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=2,
+ predict_with_generate=False,
+ prediction_loss_only=False,
+ problem_type=None,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ quant_bits=None,
+ quant_method=None,
+ ray_scope=last,
+ reft_args=None,
+ reft_intervention_type=LoreftIntervention,
+ reft_layer_key=None,
+ reft_layers=None,
+ reft_rank=4,
+ remove_unused_columns=True,
+ repetition_penalty=None,
+ report_to=['tensorboard'],
+ response_prefix=None,
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ resume_only_model=False,
+ rope_scaling=None,
+ router_aux_loss_coef=0.0,
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=epoch,
+ save_total_limit=None,
+ seed=42,
+ sequence_parallel_size=1,
+ shuffle_buffer_size=1000,
+ skip_memory_metrics=True,
+ sortish_sampler=False,
+ split_dataset_ratio=0.001,
+ stop_words=[],
+ stopping_strategy=first_exhausted,
+ stream=False,
+ streaming=False,
+ strict=False,
+ swanlab_exp_name=None,
+ swanlab_lark_secret=None,
+ swanlab_lark_webhook_url=None,
+ swanlab_mode=cloud,
+ swanlab_project=None,
+ swanlab_token=<SWANLAB_TOKEN>,
+ swanlab_workspace=None,
+ system=None,
+ target_modules=['all-linear'],
+ target_regex=None,
+ task_type=causal_lm,
+ temperature=0.0,
+ template=qwen2_5,
+ template_backend=swift,
+ tf32=None,
+ top_k=None,
+ top_logprobs=None,
+ top_p=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torch_dtype=torch.bfloat16,
+ torch_empty_cache_steps=None,
+ torchdynamo=None,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ train_dataloader_shuffle=True,
+ train_type=full,
+ trainable_parameters=[],
+ trainable_parameters_regex=None,
+ truncation_strategy=delete,
+ tuner_backend=peft,
+ use_chat_template=True,
+ use_cpu=False,
+ use_dora=False,
+ use_flash_ckpt=False,
+ use_galore=False,
+ use_hf=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_liger_kernel=False,
+ use_logits_to_keep=None,
+ use_mps_device=False,
+ use_rslora=False,
+ use_swift_lora=False,
+ val_dataset=[],
+ val_dataset_shuffle=False,
+ vera_d_initial=0.1,
+ vera_dropout=0.0,
+ vera_projection_prng_key=0,
+ vera_rank=256,
+ vit_gradient_checkpointing=None,
+ vit_lr=None,
+ warmup_ratio=0.05,
+ warmup_steps=0,
+ weight_decay=0.1,
+ zero_hpz_partition_size=None,
+ )
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:28,459] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] model_kwargs: {'device_map': None}
+ [2025-09-17 13:26:30,183] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:31,829] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:33,474] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:35,123] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:36,825] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:38,513] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:26:40,270] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ [2025-09-17 13:26:40,417] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
+
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "pad_token_id": 151643,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.55.4",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ }
+ , task_type='causal_lm', num_labels=None)
+ [INFO:swift] model.generation_config: GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "max_new_tokens": 64,
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05
+ }
+
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
+ [INFO:swift] max_length: 16240
+ [INFO:swift] response_prefix: ''
+ [INFO:swift] agent_template: hermes
+ [INFO:swift] Start time of running main: 2025-09-17 13:26:42.816518
+ [INFO:swift] swift.__version__: 3.8.0.dev0
+ [rank1]: Traceback (most recent call last):
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+ [rank1]: sft_main()
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+ [rank1]: return SwiftSft(args).main()
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
+ [rank1]: result = self.run()
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
+ [rank1]: train_dataset, val_dataset = self._prepare_dataset()
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
+ [rank1]: train_dataset, val_dataset = self._get_dataset()
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
+ [rank1]: train_dataset, val_dataset = load_dataset(
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
+ [rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
+ [rank1]: dataset = DatasetLoader._load_repo_dataset(
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+ [rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+ [rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
495
+ [rank6]: Traceback (most recent call last):
496
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
497
+ [rank6]: sft_main()
498
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
499
+ [rank6]: return SwiftSft(args).main()
500
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
501
+ [rank6]: result = self.run()
502
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
503
+ [rank6]: train_dataset, val_dataset = self._prepare_dataset()
504
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
505
+ [rank6]: train_dataset, val_dataset = self._get_dataset()
506
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
507
+ [rank6]: train_dataset, val_dataset = load_dataset(
508
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
509
+ [rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
510
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
511
+ [rank6]: dataset = DatasetLoader._load_repo_dataset(
512
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
513
+ [rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
514
+ [rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
515
+ [rank5]: Traceback (most recent call last):
516
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
517
+ [rank5]: sft_main()
518
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
519
+ [rank5]: return SwiftSft(args).main()
520
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
521
+ [rank5]: result = self.run()
522
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
523
+ [rank5]: train_dataset, val_dataset = self._prepare_dataset()
524
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
525
+ [rank5]: train_dataset, val_dataset = self._get_dataset()
526
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
527
+ [rank5]: train_dataset, val_dataset = load_dataset(
528
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
529
+ [rank5]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
530
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
531
+ [rank5]: dataset = DatasetLoader._load_repo_dataset(
532
+ [rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
533
+ [rank5]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
534
+ [rank5]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
535
+ [rank4]: Traceback (most recent call last):
536
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
537
+ [rank4]: sft_main()
538
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
539
+ [rank4]: return SwiftSft(args).main()
540
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
541
+ [rank4]: result = self.run()
542
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
543
+ [rank4]: train_dataset, val_dataset = self._prepare_dataset()
544
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
545
+ [rank4]: train_dataset, val_dataset = self._get_dataset()
546
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
547
+ [rank4]: train_dataset, val_dataset = load_dataset(
548
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
549
+ [rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
550
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
551
+ [rank4]: dataset = DatasetLoader._load_repo_dataset(
552
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
553
+ [rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
554
+ [rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
+ [rank0]: Traceback (most recent call last):
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+ [rank0]: sft_main()
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+ [rank0]: return SwiftSft(args).main()
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
+ [rank0]: result = self.run()
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
+ [rank0]: train_dataset, val_dataset = self._prepare_dataset()
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
+ [rank0]: train_dataset, val_dataset = self._get_dataset()
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
+ [rank0]: train_dataset, val_dataset = load_dataset(
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
+ [rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
+ [rank0]: dataset = DatasetLoader._load_repo_dataset(
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+ [rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+ [rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
+ [rank0]:[W917 13:26:48.475248646 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+ W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212634 closing signal SIGTERM
+ W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212636 closing signal SIGTERM
+ W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212637 closing signal SIGTERM
+ W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212638 closing signal SIGTERM
+ W0917 13:26:49.062000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212639 closing signal SIGTERM
+ W0917 13:26:49.064000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212640 closing signal SIGTERM
+ W0917 13:26:49.065000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212641 closing signal SIGTERM
+ E0917 13:26:50.209000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 212635) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
+ Traceback (most recent call last):
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+ return _run_code(code, main_globals, None,
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
+ exec(code, run_globals)
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+ main()
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+ return f(*args, **kwargs)
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+ run(args)
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+ elastic_launch(
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ <NO_OTHER_FAILURES>
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2025-09-17_13:26:49
+ host : TENCENT64.site
+ rank : 1 (local_rank: 1)
+ exitcode : 1 (pid: 212635)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
log/20250917-13:35:09.log ADDED
@@ -0,0 +1,676 @@
+ run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+
+ *****************************************
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ *****************************************
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:23,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:24,740] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:24,749] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:35:25,072] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] Setting args.lazy_tokenize: False
+ [2025-09-17 13:35:26,421] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:26,430] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+ [2025-09-17 13:35:26,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:28,235] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:28,244] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:35:28,244] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ [2025-09-17 13:35:28,524] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:29,859] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:29,867] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:35:30,411] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:32,085] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:32,094] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:35:32,252] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:33,593] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:33,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:35:33,602] [INFO] [comm.py:821:init_distributed] cdb=None
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:34,920] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:34,928] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:35:35,271] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:35:36,578] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:35:36,586] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538
+ [INFO:swift] Global seed set to 42
+ [INFO:swift] args: TrainArguments(
+ _n_gpu=-1,
+ acc_strategy=token,
+ accelerator_config={'dispatch_batches': False},
+ adafactor=False,
+ adalora_beta1=0.85,
+ adalora_beta2=0.85,
+ adalora_deltaT=1,
+ adalora_init_r=12,
+ adalora_orth_reg_weight=0.5,
+ adalora_target_r=8,
+ adalora_tfinal=0,
+ adalora_tinit=0,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_epsilon=1e-08,
+ adapter_act=gelu,
+ adapter_length=128,
+ adapters=[],
+ add_version=True,
+ agent_template=None,
+ aligner_lr=None,
+ attn_impl=None,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=True,
+ batch_eval_metrics=False,
+ bf16=True,
+ bf16_full_eval=False,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_quant_storage=None,
+ bnb_4bit_quant_type=nf4,
+ bnb_4bit_use_double_quant=True,
+ boft_block_num=0,
+ boft_block_size=4,
+ boft_dropout=0.0,
+ boft_n_butterfly_factor=1,
+ cached_dataset=[],
+ channels=None,
+ check_model=True,
+ ckpt_dir=None,
+ columns={},
+ create_checkpoint_symlink=False,
+ custom_dataset_info=[],
+ custom_register_path=[],
+ data_seed=42,
+ dataloader_drop_last=False,
+ dataloader_num_workers=48,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset=['/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
+ dataset_num_proc=100,
+ dataset_shuffle=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=18000000,
+ debug=None,
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
+ deepspeed_autotp_size=None,
+ device_map=None,
+ disable_tqdm=None,
+ do_eval=False,
+ do_predict=False,
+ do_train=False,
+ download_mode=reuse_dataset_if_exists,
+ ds3_gather_for_generation=True,
+ early_stop_interval=None,
+ enable_dft_loss=False,
+ eval_accumulation_steps=None,
+ eval_dataset=[],
+ eval_dataset_args=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_generation_config=None,
+ eval_limit=None,
+ eval_on_start=False,
+ eval_steps=2000.0,
+ eval_strategy=epoch,
+ eval_use_evalscope=False,
+ eval_use_gather_object=False,
+ external_plugins=[],
+ extra_eval_args=None,
+ fourier_n_frequency=2000,
+ fourier_scaling=300.0,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ freeze_aligner=False,
+ freeze_llm=False,
+ freeze_parameters=[],
+ freeze_parameters_ratio=0.0,
+ freeze_parameters_regex=None,
+ freeze_vit=True,
+ fsdp=,
+ fsdp_config=None,
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ galore_cos_threshold=0.4,
+ galore_gamma_proj=2,
+ galore_optim_per_parameter=False,
+ galore_proj_bits=4,
+ galore_proj_group_size=256,
+ galore_proj_quant=False,
+ galore_proj_type=std,
+ galore_quantization=False,
+ galore_queue_size=5,
+ galore_rank=128,
+ galore_scale=1.0,
+ galore_target_modules=None,
+ galore_update_proj_gap=50,
+ galore_with_embedding=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=4,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=False,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hqq_axis=None,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=None,
+ hub_revision=None,
+ hub_strategy=every_save,
+ hub_token=<HUB_TOKEN>,
+ ignore_args_error=False,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ init_strategy=None,
+ init_weights=True,
+ interleave_prob=None,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ lazy_tokenize=False,
+ learning_rate=5e-06,
+ length_column_name=length,
+ liger_kernel_config=None,
+ lisa_activated_layers=0,
+ lisa_step_interval=20,
+ llamapro_num_groups=None,
+ llamapro_num_new_blocks=4,
+ load_args=False,
+ load_best_model_at_end=False,
+ load_data_args=False,
+ load_from_cache_file=True,
+ local_rank=0,
+ local_repo_path=None,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538/runs,
+ logging_first_step=True,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=steps,
+ logprobs=False,
+ lora_alpha=32,
+ lora_bias=none,
+ lora_dropout=0.05,
+ lora_dtype=None,
+ lora_ga_batch_size=2,
+ lora_ga_direction=ArB2r,
+ lora_ga_iters=2,
+ lora_ga_max_length=1024,
+ lora_ga_scale=stable,
+ lora_ga_stable_gamma=16,
+ lora_modules=[],
+ lora_rank=8,
+ lorap_lr_ratio=None,
+ loss_scale=default,
+ loss_type=None,
+ lr_scheduler_kwargs=None,
+ lr_scheduler_type=cosine,
+ max_epochs=None,
+ max_grad_norm=1.0,
+ max_length=16240,
+ max_memory={},
+ max_model_len=None,
+ max_new_tokens=64,
+ max_pixels=None,
+ max_steps=-1,
+ metric=None,
+ metric_for_best_model=loss,
+ model=Qwen/Qwen2.5-7B-Instruct,
+ model_author=None,
+ model_kwargs={},
+ model_name=None,
+ model_revision=None,
+ model_type=qwen2_5,
+ modules_to_save=[],
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ new_special_tokens=[],
+ no_cuda=False,
+ norm_bbox=None,
+ num_beams=1,
+ num_labels=None,
+ num_train_epochs=2.0,
+ optim=adamw_torch_fused,
+ optim_args=None,
+ optim_target_modules=None,
+ optimizer=None,
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538,
+ overwrite_output_dir=False,
+ packing=False,
+ packing_length=None,
+ padding_free=False,
+ padding_side=right,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=2,
+ predict_with_generate=False,
+ prediction_loss_only=False,
+ problem_type=None,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ quant_bits=None,
+ quant_method=None,
+ ray_scope=last,
+ reft_args=None,
+ reft_intervention_type=LoreftIntervention,
+ reft_layer_key=None,
+ reft_layers=None,
+ reft_rank=4,
+ remove_unused_columns=True,
+ repetition_penalty=None,
+ report_to=['tensorboard'],
+ response_prefix=None,
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ resume_only_model=False,
+ rope_scaling=None,
+ router_aux_loss_coef=0.0,
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=epoch,
+ save_total_limit=None,
+ seed=42,
+ sequence_parallel_size=1,
+ shuffle_buffer_size=1000,
+ skip_memory_metrics=True,
+ sortish_sampler=False,
+ split_dataset_ratio=0.001,
+ stop_words=[],
+ stopping_strategy=first_exhausted,
+ stream=False,
+ streaming=False,
+ strict=False,
+ swanlab_exp_name=None,
+ swanlab_lark_secret=None,
+ swanlab_lark_webhook_url=None,
+ swanlab_mode=cloud,
+ swanlab_project=None,
+ swanlab_token=<SWANLAB_TOKEN>,
+ swanlab_workspace=None,
+ system=None,
+ target_modules=['all-linear'],
+ target_regex=None,
+ task_type=causal_lm,
+ temperature=0.0,
+ template=qwen2_5,
+ template_backend=swift,
+ tf32=None,
+ top_k=None,
+ top_logprobs=None,
+ top_p=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torch_dtype=torch.bfloat16,
+ torch_empty_cache_steps=None,
+ torchdynamo=None,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ train_dataloader_shuffle=True,
+ train_type=full,
+ trainable_parameters=[],
+ trainable_parameters_regex=None,
+ truncation_strategy=delete,
+ tuner_backend=peft,
+ use_chat_template=True,
+ use_cpu=False,
+ use_dora=False,
+ use_flash_ckpt=False,
+ use_galore=False,
+ use_hf=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_liger_kernel=False,
+ use_logits_to_keep=None,
+ use_mps_device=False,
+ use_rslora=False,
+ use_swift_lora=False,
+ val_dataset=[],
+ val_dataset_shuffle=False,
+ vera_d_initial=0.1,
+ vera_dropout=0.0,
+ vera_projection_prng_key=0,
+ vera_rank=256,
+ vit_gradient_checkpointing=None,
+ vit_lr=None,
+ warmup_ratio=0.05,
+ warmup_steps=0,
+ weight_decay=0.1,
+ zero_hpz_partition_size=None,
+ )
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:42,156] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:43,747] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:45,419] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] model_kwargs: {'device_map': None}
+ [2025-09-17 13:35:47,059] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:48,636] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:50,272] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:51,903] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:35:53,570] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ [2025-09-17 13:35:53,719] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
+
+
+
+
+
+
+
+ [rank3]: Traceback (most recent call last):
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+ [rank3]: sft_main()
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+ [rank3]: return SwiftSft(args).main()
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
+ [rank3]: result = self.run()
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
+ [rank3]: train_dataset, val_dataset = self._prepare_dataset()
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
+ [rank3]: train_dataset, val_dataset = self._get_dataset()
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
+ [rank3]: train_dataset, val_dataset = load_dataset(
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
+ [rank3]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
+ [rank3]: dataset = DatasetLoader._load_repo_dataset(
+ [rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+ [rank3]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+ [rank3]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
+ [rank2]: Traceback (most recent call last):
438
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
439
+ [rank2]: sft_main()
440
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
441
+ [rank2]: return SwiftSft(args).main()
442
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
443
+ [rank2]: result = self.run()
444
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
445
+ [rank2]: train_dataset, val_dataset = self._prepare_dataset()
446
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
447
+ [rank2]: train_dataset, val_dataset = self._get_dataset()
448
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
449
+ [rank2]: train_dataset, val_dataset = load_dataset(
450
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
451
+ [rank2]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
452
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
453
+ [rank2]: dataset = DatasetLoader._load_repo_dataset(
454
+ [rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
455
+ [rank2]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
456
+ [rank2]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
457
+ [rank7]: Traceback (most recent call last):
458
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
459
+ [rank7]: sft_main()
460
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
461
+ [rank7]: return SwiftSft(args).main()
462
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
463
+ [rank7]: result = self.run()
464
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
465
+ [rank7]: train_dataset, val_dataset = self._prepare_dataset()
466
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
467
+ [rank7]: train_dataset, val_dataset = self._get_dataset()
468
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
469
+ [rank7]: train_dataset, val_dataset = load_dataset(
470
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
471
+ [rank7]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
472
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
473
+ [rank7]: dataset = DatasetLoader._load_repo_dataset(
474
+ [rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
475
+ [rank7]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
476
+ [rank7]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
477
+ [rank4]: Traceback (most recent call last):
478
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
479
+ [rank4]: sft_main()
480
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
481
+ [rank4]: return SwiftSft(args).main()
482
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
483
+ [rank4]: result = self.run()
484
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
485
+ [rank4]: train_dataset, val_dataset = self._prepare_dataset()
486
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
487
+ [rank4]: train_dataset, val_dataset = self._get_dataset()
488
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
489
+ [rank4]: train_dataset, val_dataset = load_dataset(
490
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
491
+ [rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
492
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
493
+ [rank4]: dataset = DatasetLoader._load_repo_dataset(
494
+ [rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
495
+ [rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
496
+ [rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
497
+ [rank1]: Traceback (most recent call last):
498
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
499
+ [rank1]: sft_main()
500
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
501
+ [rank1]: return SwiftSft(args).main()
502
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
503
+ [rank1]: result = self.run()
504
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
505
+ [rank1]: train_dataset, val_dataset = self._prepare_dataset()
506
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
507
+ [rank1]: train_dataset, val_dataset = self._get_dataset()
508
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
509
+ [rank1]: train_dataset, val_dataset = load_dataset(
510
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
511
+ [rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
512
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
513
+ [rank1]: dataset = DatasetLoader._load_repo_dataset(
514
+ [rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
515
+ [rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
516
+ [rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
517
+ [rank6]: Traceback (most recent call last):
518
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
519
+ [rank6]: sft_main()
520
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
521
+ [rank6]: return SwiftSft(args).main()
522
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
523
+ [rank6]: result = self.run()
524
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
525
+ [rank6]: train_dataset, val_dataset = self._prepare_dataset()
526
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
527
+ [rank6]: train_dataset, val_dataset = self._get_dataset()
528
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
529
+ [rank6]: train_dataset, val_dataset = load_dataset(
530
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
531
+ [rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
532
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
533
+ [rank6]: dataset = DatasetLoader._load_repo_dataset(
534
+ [rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
535
+ [rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
536
+ [rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
537
+
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.4",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 152064
+ }
+ , task_type='causal_lm', num_labels=None)
+ [INFO:swift] model.generation_config: GenerationConfig {
+   "bos_token_id": 151643,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "max_new_tokens": 64,
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05
+ }
+
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
+ [INFO:swift] max_length: 16240
+ [INFO:swift] response_prefix: ''
+ [INFO:swift] agent_template: hermes
+ [INFO:swift] Start time of running main: 2025-09-17 13:35:55.854352
+ [INFO:swift] swift.__version__: 3.8.0.dev0
+ [rank0]: Traceback (most recent call last):
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+ [rank0]:     sft_main()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+ [rank0]:     return SwiftSft(args).main()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
+ [rank0]:     result = self.run()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
+ [rank0]:     train_dataset, val_dataset = self._prepare_dataset()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
+ [rank0]:     train_dataset, val_dataset = self._get_dataset()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
+ [rank0]:     train_dataset, val_dataset = load_dataset(
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
+ [rank0]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
+ [rank0]:     dataset = DatasetLoader._load_repo_dataset(
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+ [rank0]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+ [rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
+ [rank0]:[W917 13:35:56.111200305 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+ W0917 13:35:57.757000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214064 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214065 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214067 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214068 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214069 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214070 closing signal SIGTERM
+ W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214071 closing signal SIGTERM
+ E0917 13:35:58.956000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 214066) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
+ Traceback (most recent call last):
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+     return _run_code(code, main_globals, None,
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
+     exec(code, run_globals)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+     main()
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+     return f(*args, **kwargs)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+     run(args)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+     elastic_launch(
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+     return launch_agent(self._config, self._entrypoint, list(args))
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+     raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
+ ------------------------------------------------------------
+ Failures:
+   <NO_OTHER_FAILURES>
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+   time      : 2025-09-17_13:35:57
+   host      : TENCENT64.site
+   rank      : 2 (local_rank: 2)
+   exitcode  : 1 (pid: 214066)
+   error_file: <N/A>
+   traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
log/20250917-13:41:16.log ADDED
@@ -0,0 +1,676 @@
+ run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+
+ *****************************************
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ *****************************************
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:29,202] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] Setting args.lazy_tokenize: False
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+ [2025-09-17 13:41:30,559] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:30,567] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:30,594] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:31,893] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:31,902] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:31,902] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ [2025-09-17 13:41:32,524] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:34,115] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:34,123] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:34,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:35,568] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:35,577] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:35,847] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:37,286] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:37,294] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:37,640] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:38,974] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:41:38,986] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:38,995] [INFO] [comm.py:821:init_distributed] cdb=None
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:40,341] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:40,350] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:41:40,690] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:41:41,989] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:41:41,999] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144
+ [INFO:swift] Global seed set to 42
+ [INFO:swift] args: TrainArguments(
+ _n_gpu=-1,
+ acc_strategy=token,
+ accelerator_config={'dispatch_batches': False},
+ adafactor=False,
+ adalora_beta1=0.85,
+ adalora_beta2=0.85,
+ adalora_deltaT=1,
+ adalora_init_r=12,
+ adalora_orth_reg_weight=0.5,
+ adalora_target_r=8,
+ adalora_tfinal=0,
+ adalora_tinit=0,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_epsilon=1e-08,
+ adapter_act=gelu,
+ adapter_length=128,
+ adapters=[],
+ add_version=True,
+ agent_template=None,
+ aligner_lr=None,
+ attn_impl=None,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=True,
+ batch_eval_metrics=False,
+ bf16=True,
+ bf16_full_eval=False,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_quant_storage=None,
+ bnb_4bit_quant_type=nf4,
+ bnb_4bit_use_double_quant=True,
+ boft_block_num=0,
+ boft_block_size=4,
+ boft_dropout=0.0,
+ boft_n_butterfly_factor=1,
+ cached_dataset=[],
+ channels=None,
+ check_model=True,
+ ckpt_dir=None,
+ columns={},
+ create_checkpoint_symlink=False,
+ custom_dataset_info=[],
+ custom_register_path=[],
+ data_seed=42,
+ dataloader_drop_last=False,
+ dataloader_num_workers=48,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
+ dataset_num_proc=100,
+ dataset_shuffle=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=18000000,
+ debug=None,
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
+ deepspeed_autotp_size=None,
+ device_map=None,
+ disable_tqdm=None,
+ do_eval=False,
+ do_predict=False,
+ do_train=False,
+ download_mode=reuse_dataset_if_exists,
+ ds3_gather_for_generation=True,
+ early_stop_interval=None,
+ enable_dft_loss=False,
+ eval_accumulation_steps=None,
+ eval_dataset=[],
+ eval_dataset_args=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_generation_config=None,
+ eval_limit=None,
+ eval_on_start=False,
+ eval_steps=2000.0,
+ eval_strategy=epoch,
+ eval_use_evalscope=False,
+ eval_use_gather_object=False,
+ external_plugins=[],
+ extra_eval_args=None,
+ fourier_n_frequency=2000,
+ fourier_scaling=300.0,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ freeze_aligner=False,
+ freeze_llm=False,
+ freeze_parameters=[],
+ freeze_parameters_ratio=0.0,
+ freeze_parameters_regex=None,
+ freeze_vit=True,
+ fsdp=,
+ fsdp_config=None,
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ galore_cos_threshold=0.4,
+ galore_gamma_proj=2,
+ galore_optim_per_parameter=False,
+ galore_proj_bits=4,
+ galore_proj_group_size=256,
+ galore_proj_quant=False,
+ galore_proj_type=std,
+ galore_quantization=False,
+ galore_queue_size=5,
+ galore_rank=128,
+ galore_scale=1.0,
+ galore_target_modules=None,
+ galore_update_proj_gap=50,
+ galore_with_embedding=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=4,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=False,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hqq_axis=None,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=None,
+ hub_revision=None,
+ hub_strategy=every_save,
+ hub_token=<HUB_TOKEN>,
+ ignore_args_error=False,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ init_strategy=None,
+ init_weights=True,
+ interleave_prob=None,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ lazy_tokenize=False,
+ learning_rate=5e-06,
+ length_column_name=length,
+ liger_kernel_config=None,
+ lisa_activated_layers=0,
+ lisa_step_interval=20,
+ llamapro_num_groups=None,
+ llamapro_num_new_blocks=4,
+ load_args=False,
+ load_best_model_at_end=False,
+ load_data_args=False,
+ load_from_cache_file=True,
+ local_rank=0,
+ local_repo_path=None,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144/runs,
+ logging_first_step=True,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=steps,
+ logprobs=False,
+ lora_alpha=32,
+ lora_bias=none,
+ lora_dropout=0.05,
+ lora_dtype=None,
+ lora_ga_batch_size=2,
+ lora_ga_direction=ArB2r,
+ lora_ga_iters=2,
+ lora_ga_max_length=1024,
+ lora_ga_scale=stable,
+ lora_ga_stable_gamma=16,
+ lora_modules=[],
+ lora_rank=8,
+ lorap_lr_ratio=None,
+ loss_scale=default,
+ loss_type=None,
+ lr_scheduler_kwargs=None,
+ lr_scheduler_type=cosine,
+ max_epochs=None,
+ max_grad_norm=1.0,
+ max_length=16240,
+ max_memory={},
+ max_model_len=None,
+ max_new_tokens=64,
+ max_pixels=None,
+ max_steps=-1,
+ metric=None,
+ metric_for_best_model=loss,
+ model=Qwen/Qwen2.5-7B-Instruct,
+ model_author=None,
+ model_kwargs={},
+ model_name=None,
+ model_revision=None,
+ model_type=qwen2_5,
+ modules_to_save=[],
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ new_special_tokens=[],
+ no_cuda=False,
+ norm_bbox=None,
+ num_beams=1,
+ num_labels=None,
+ num_train_epochs=2.0,
+ optim=adamw_torch_fused,
+ optim_args=None,
+ optim_target_modules=None,
+ optimizer=None,
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144,
+ overwrite_output_dir=False,
+ packing=False,
+ packing_length=None,
+ padding_free=False,
+ padding_side=right,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=2,
+ predict_with_generate=False,
+ prediction_loss_only=False,
+ problem_type=None,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ quant_bits=None,
+ quant_method=None,
+ ray_scope=last,
+ reft_args=None,
+ reft_intervention_type=LoreftIntervention,
+ reft_layer_key=None,
+ reft_layers=None,
+ reft_rank=4,
+ remove_unused_columns=True,
+ repetition_penalty=None,
+ report_to=['tensorboard'],
+ response_prefix=None,
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ resume_only_model=False,
291
+ rope_scaling=None,
292
+ router_aux_loss_coef=0.0,
293
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144,
294
+ save_on_each_node=False,
295
+ save_only_model=False,
296
+ save_safetensors=True,
297
+ save_steps=500,
298
+ save_strategy=epoch,
299
+ save_total_limit=None,
300
+ seed=42,
301
+ sequence_parallel_size=1,
302
+ shuffle_buffer_size=1000,
303
+ skip_memory_metrics=True,
304
+ sortish_sampler=False,
305
+ split_dataset_ratio=0.001,
306
+ stop_words=[],
307
+ stopping_strategy=first_exhausted,
308
+ stream=False,
309
+ streaming=False,
310
+ strict=False,
311
+ swanlab_exp_name=None,
312
+ swanlab_lark_secret=None,
313
+ swanlab_lark_webhook_url=None,
314
+ swanlab_mode=cloud,
315
+ swanlab_project=None,
316
+ swanlab_token=<SWANLAB_TOKEN>,
317
+ swanlab_workspace=None,
318
+ system=None,
319
+ target_modules=['all-linear'],
320
+ target_regex=None,
321
+ task_type=causal_lm,
322
+ temperature=0.0,
323
+ template=qwen2_5,
324
+ template_backend=swift,
325
+ tf32=None,
326
+ top_k=None,
327
+ top_logprobs=None,
328
+ top_p=None,
329
+ torch_compile=False,
330
+ torch_compile_backend=None,
331
+ torch_compile_mode=None,
332
+ torch_dtype=torch.bfloat16,
333
+ torch_empty_cache_steps=None,
334
+ torchdynamo=None,
335
+ tpu_metrics_debug=False,
336
+ tpu_num_cores=None,
337
+ train_dataloader_shuffle=True,
338
+ train_type=full,
339
+ trainable_parameters=[],
340
+ trainable_parameters_regex=None,
341
+ truncation_strategy=delete,
342
+ tuner_backend=peft,
343
+ use_chat_template=True,
344
+ use_cpu=False,
345
+ use_dora=False,
346
+ use_flash_ckpt=False,
347
+ use_galore=False,
348
+ use_hf=False,
349
+ use_ipex=False,
350
+ use_legacy_prediction_loop=False,
351
+ use_liger_kernel=False,
352
+ use_logits_to_keep=None,
353
+ use_mps_device=False,
354
+ use_rslora=False,
355
+ use_swift_lora=False,
356
+ val_dataset=[],
357
+ val_dataset_shuffle=False,
358
+ vera_d_initial=0.1,
359
+ vera_dropout=0.0,
360
+ vera_projection_prng_key=0,
361
+ vera_rank=256,
362
+ vit_gradient_checkpointing=None,
363
+ vit_lr=None,
364
+ warmup_ratio=0.05,
365
+ warmup_steps=0,
366
+ weight_decay=0.1,
367
+ zero_hpz_partition_size=None,
368
+ )
369
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:47,308] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] model_kwargs: {'device_map': None}
+ [2025-09-17 13:41:48,909] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:50,527] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:52,236] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:53,964] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:55,616] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:57,230] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:41:58,901] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ [2025-09-17 13:41:59,122] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.4",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 152064
+ }
+ , task_type='causal_lm', num_labels=None)
+ [INFO:swift] model.generation_config: GenerationConfig {
+   "bos_token_id": 151643,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "max_new_tokens": 64,
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05
+ }
+
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
+ [INFO:swift] max_length: 16240
+ [INFO:swift] response_prefix: ''
+ [INFO:swift] agent_template: hermes
+ [INFO:swift] Start time of running main: 2025-09-17 13:42:01.223333
+ [INFO:swift] swift.__version__: 3.8.0.dev0
+ [rank0]: Traceback (most recent call last):
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
+ [rank0]:     sft_main()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
+ [rank0]:     return SwiftSft(args).main()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
+ [rank0]:     result = self.run()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
+ [rank0]:     train_dataset, val_dataset = self._prepare_dataset()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
+ [rank0]:     train_dataset, val_dataset = self._get_dataset()
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
+ [rank0]:     train_dataset, val_dataset = load_dataset(
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
+ [rank0]:     train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
+ [rank0]:     dataset = DatasetLoader._load_repo_dataset(
+ [rank0]:   File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
+ [rank0]:     raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
+ [rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
+ [rank0]:[W917 13:42:04.042692069 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+ W0917 13:42:05.588000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215461 closing signal SIGTERM
+ W0917 13:42:05.589000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215462 closing signal SIGTERM
+ W0917 13:42:05.590000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215464 closing signal SIGTERM
+ W0917 13:42:05.590000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215465 closing signal SIGTERM
+ W0917 13:42:05.591000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215466 closing signal SIGTERM
+ W0917 13:42:05.592000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215467 closing signal SIGTERM
+ W0917 13:42:05.592000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215468 closing signal SIGTERM
+ E0917 13:42:06.625000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 215463) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
+ Traceback (most recent call last):
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+     return _run_code(code, main_globals, None,
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
+     exec(code, run_globals)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+     main()
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+     return f(*args, **kwargs)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+     run(args)
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+     elastic_launch(
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+     return launch_agent(self._config, self._entrypoint, list(args))
+   File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+     raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
+ ------------------------------------------------------------
+ Failures:
+   <NO_OTHER_FAILURES>
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+   time      : 2025-09-17_13:42:05
+   host      : TENCENT64.site
+   rank      : 2 (local_rank: 2)
+   exitcode  : 1 (pid: 215463)
+   error_file: <N/A>
+   traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
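The ValueError above is raised because one of the `--dataset` paths is not on disk: the failing file sits directly under `ms-swift/` rather than under the `data/` directory used by its sibling datasets. A minimal pre-flight check before relaunching might look like the following sketch (the helper name and example paths are hypothetical, not part of ms-swift):

```python
import os
import tempfile

def missing_paths(paths):
    """Return the subset of dataset paths that do not exist on disk."""
    return [p for p in paths if not os.path.exists(p)]

# Hypothetical example: one real temporary file, one path that does not exist.
with tempfile.NamedTemporaryFile(suffix=".jsonl") as f:
    print(missing_paths([f.name, "/no/such/dir/dataset.jsonl"]))
```

Running such a check on the exact list passed to `--dataset` surfaces the bad path before all eight ranks load the model and fail together.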
log/20250917-13:44:32.log ADDED
@@ -0,0 +1,676 @@
+ run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
+
+ *****************************************
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ *****************************************
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:46,353] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:47,709] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:47,717] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:47,876] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:49,192] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:49,200] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:49,878] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] Setting args.lazy_tokenize: False
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+ [2025-09-17 13:44:51,216] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:51,225] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:51,315] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:52,668] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:52,676] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:52,676] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ [2025-09-17 13:44:53,216] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:54,529] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:54,538] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:54,907] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:56,316] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:56,325] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:56,562] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:44:57,878] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:44:57,886] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:44:58,181] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:44:59,565] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
45
+ [2025-09-17 13:44:59,574] [INFO] [comm.py:821:init_distributed] cdb=None
46
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501
47
+ [INFO:swift] Global seed set to 42
48
+ [INFO:swift] args: TrainArguments(
49
+ _n_gpu=-1,
50
+ acc_strategy=token,
51
+ accelerator_config={'dispatch_batches': False},
52
+ adafactor=False,
53
+ adalora_beta1=0.85,
54
+ adalora_beta2=0.85,
55
+ adalora_deltaT=1,
56
+ adalora_init_r=12,
57
+ adalora_orth_reg_weight=0.5,
58
+ adalora_target_r=8,
59
+ adalora_tfinal=0,
60
+ adalora_tinit=0,
61
+ adam_beta1=0.9,
62
+ adam_beta2=0.95,
63
+ adam_epsilon=1e-08,
64
+ adapter_act=gelu,
65
+ adapter_length=128,
66
+ adapters=[],
67
+ add_version=True,
68
+ agent_template=None,
69
+ aligner_lr=None,
70
+ attn_impl=None,
71
+ auto_find_batch_size=False,
72
+ average_tokens_across_devices=True,
73
+ batch_eval_metrics=False,
74
+ bf16=True,
75
+ bf16_full_eval=False,
76
+ bnb_4bit_compute_dtype=torch.bfloat16,
77
+ bnb_4bit_quant_storage=None,
78
+ bnb_4bit_quant_type=nf4,
79
+ bnb_4bit_use_double_quant=True,
80
+ boft_block_num=0,
81
+ boft_block_size=4,
82
+ boft_dropout=0.0,
83
+ boft_n_butterfly_factor=1,
84
+ cached_dataset=[],
85
+ channels=None,
86
+ check_model=True,
87
+ ckpt_dir=None,
88
+ columns={},
89
+ create_checkpoint_symlink=False,
90
+ custom_dataset_info=[],
91
+ custom_register_path=[],
92
+ data_seed=42,
93
+ dataloader_drop_last=False,
94
+ dataloader_num_workers=48,
95
+ dataloader_persistent_workers=False,
96
+ dataloader_pin_memory=True,
97
+ dataloader_prefetch_factor=None,
98
+ dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
99
+ dataset_num_proc=100,
100
+ dataset_shuffle=True,
101
+ ddp_backend=None,
102
+ ddp_broadcast_buffers=None,
103
+ ddp_bucket_cap_mb=None,
104
+ ddp_find_unused_parameters=None,
105
+ ddp_timeout=18000000,
106
+ debug=None,
107
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
108
+ deepspeed_autotp_size=None,
109
+ device_map=None,
110
+ disable_tqdm=None,
111
+ do_eval=False,
112
+ do_predict=False,
113
+ do_train=False,
114
+ download_mode=reuse_dataset_if_exists,
115
+ ds3_gather_for_generation=True,
116
+ early_stop_interval=None,
117
+ enable_dft_loss=False,
118
+ eval_accumulation_steps=None,
119
+ eval_dataset=[],
120
+ eval_dataset_args=None,
121
+ eval_delay=0,
122
+ eval_do_concat_batches=True,
123
+ eval_generation_config=None,
124
+ eval_limit=None,
125
+ eval_on_start=False,
126
+ eval_steps=2000.0,
127
+ eval_strategy=epoch,
128
+ eval_use_evalscope=False,
129
+ eval_use_gather_object=False,
130
+ external_plugins=[],
131
+ extra_eval_args=None,
132
+ fourier_n_frequency=2000,
133
+ fourier_scaling=300.0,
134
+ fp16=False,
135
+ fp16_backend=auto,
136
+ fp16_full_eval=False,
137
+ fp16_opt_level=O1,
138
+ freeze_aligner=False,
139
+ freeze_llm=False,
140
+ freeze_parameters=[],
141
+ freeze_parameters_ratio=0.0,
142
+ freeze_parameters_regex=None,
143
+ freeze_vit=True,
144
+ fsdp=,
145
+ fsdp_config=None,
146
+ fsdp_min_num_params=0,
147
+ fsdp_transformer_layer_cls_to_wrap=None,
148
+ full_determinism=False,
149
+ galore_cos_threshold=0.4,
150
+ galore_gamma_proj=2,
151
+ galore_optim_per_parameter=False,
152
+ galore_proj_bits=4,
153
+ galore_proj_group_size=256,
154
+ galore_proj_quant=False,
155
+ galore_proj_type=std,
156
+ galore_quantization=False,
157
+ galore_queue_size=5,
158
+ galore_rank=128,
159
+ galore_scale=1.0,
160
+ galore_target_modules=None,
161
+ galore_update_proj_gap=50,
162
+ galore_with_embedding=False,
163
+ generation_config=None,
164
+ generation_max_length=None,
165
+ generation_num_beams=None,
166
+ gradient_accumulation_steps=4,
167
+ gradient_checkpointing=True,
168
+ gradient_checkpointing_kwargs=None,
169
+ greater_is_better=False,
170
+ group_by_length=False,
171
+ half_precision_backend=auto,
172
+ hqq_axis=None,
173
+ hub_always_push=False,
174
+ hub_model_id=None,
175
+ hub_private_repo=None,
176
+ hub_revision=None,
177
+ hub_strategy=every_save,
178
+ hub_token=<HUB_TOKEN>,
179
+ ignore_args_error=False,
180
+ ignore_data_skip=False,
181
+ include_for_metrics=[],
182
+ include_inputs_for_metrics=False,
183
+ include_num_input_tokens_seen=False,
184
+ include_tokens_per_second=False,
185
+ init_strategy=None,
186
+ init_weights=True,
187
+ interleave_prob=None,
188
+ jit_mode_eval=False,
189
+ label_names=None,
190
+ label_smoothing_factor=0.0,
191
+ lazy_tokenize=False,
192
+ learning_rate=5e-06,
193
+ length_column_name=length,
194
+ liger_kernel_config=None,
195
+ lisa_activated_layers=0,
196
+ lisa_step_interval=20,
197
+ llamapro_num_groups=None,
198
+ llamapro_num_new_blocks=4,
199
+ load_args=False,
200
+ load_best_model_at_end=False,
201
+ load_data_args=False,
202
+ load_from_cache_file=True,
203
+ local_rank=0,
204
+ local_repo_path=None,
205
+ log_level=passive,
206
+ log_level_replica=warning,
207
+ log_on_each_node=True,
208
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501/runs,
209
+ logging_first_step=True,
210
+ logging_nan_inf_filter=True,
211
+ logging_steps=1,
212
+ logging_strategy=steps,
213
+ logprobs=False,
214
+ lora_alpha=32,
215
+ lora_bias=none,
216
+ lora_dropout=0.05,
217
+ lora_dtype=None,
218
+ lora_ga_batch_size=2,
219
+ lora_ga_direction=ArB2r,
220
+ lora_ga_iters=2,
221
+ lora_ga_max_length=1024,
222
+ lora_ga_scale=stable,
223
+ lora_ga_stable_gamma=16,
224
+ lora_modules=[],
225
+ lora_rank=8,
226
+ lorap_lr_ratio=None,
227
+ loss_scale=default,
228
+ loss_type=None,
229
+ lr_scheduler_kwargs=None,
230
+ lr_scheduler_type=cosine,
231
+ max_epochs=None,
232
+ max_grad_norm=1.0,
233
+ max_length=16240,
234
+ max_memory={},
235
+ max_model_len=None,
236
+ max_new_tokens=64,
237
+ max_pixels=None,
238
+ max_steps=-1,
239
+ metric=None,
240
+ metric_for_best_model=loss,
241
+ model=Qwen/Qwen2.5-7B-Instruct,
242
+ model_author=None,
243
+ model_kwargs={},
244
+ model_name=None,
245
+ model_revision=None,
246
+ model_type=qwen2_5,
247
+ modules_to_save=[],
248
+ mp_parameters=,
249
+ neftune_noise_alpha=None,
250
+ new_special_tokens=[],
251
+ no_cuda=False,
252
+ norm_bbox=None,
253
+ num_beams=1,
254
+ num_labels=None,
255
+ num_train_epochs=2.0,
256
+ optim=adamw_torch_fused,
257
+ optim_args=None,
258
+ optim_target_modules=None,
259
+ optimizer=None,
260
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501,
261
+ overwrite_output_dir=False,
262
+ packing=False,
263
+ packing_length=None,
264
+ padding_free=False,
265
+ padding_side=right,
266
+ past_index=-1,
267
+ per_device_eval_batch_size=1,
268
+ per_device_train_batch_size=2,
269
+ predict_with_generate=False,
270
+ prediction_loss_only=False,
271
+ problem_type=None,
272
+ push_to_hub=False,
273
+ push_to_hub_model_id=None,
274
+ push_to_hub_organization=None,
275
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
276
+ quant_bits=None,
277
+ quant_method=None,
278
+ ray_scope=last,
279
+ reft_args=None,
280
+ reft_intervention_type=LoreftIntervention,
281
+ reft_layer_key=None,
282
+ reft_layers=None,
283
+ reft_rank=4,
284
+ remove_unused_columns=True,
285
+ repetition_penalty=None,
286
+ report_to=['tensorboard'],
287
+ response_prefix=None,
288
+ restore_callback_states_from_checkpoint=False,
289
+ resume_from_checkpoint=None,
290
+ resume_only_model=False,
291
+ rope_scaling=None,
292
+ router_aux_loss_coef=0.0,
293
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501,
294
+ save_on_each_node=False,
295
+ save_only_model=False,
296
+ save_safetensors=True,
297
+ save_steps=500,
298
+ save_strategy=epoch,
299
+ save_total_limit=None,
300
+ seed=42,
301
+ sequence_parallel_size=1,
302
+ shuffle_buffer_size=1000,
303
+ skip_memory_metrics=True,
304
+ sortish_sampler=False,
305
+ split_dataset_ratio=0.001,
306
+ stop_words=[],
307
+ stopping_strategy=first_exhausted,
308
+ stream=False,
309
+ streaming=False,
310
+ strict=False,
311
+ swanlab_exp_name=None,
312
+ swanlab_lark_secret=None,
313
+ swanlab_lark_webhook_url=None,
314
+ swanlab_mode=cloud,
315
+ swanlab_project=None,
316
+ swanlab_token=<SWANLAB_TOKEN>,
317
+ swanlab_workspace=None,
318
+ system=None,
319
+ target_modules=['all-linear'],
320
+ target_regex=None,
321
+ task_type=causal_lm,
322
+ temperature=0.0,
323
+ template=qwen2_5,
324
+ template_backend=swift,
325
+ tf32=None,
326
+ top_k=None,
327
+ top_logprobs=None,
328
+ top_p=None,
329
+ torch_compile=False,
330
+ torch_compile_backend=None,
331
+ torch_compile_mode=None,
332
+ torch_dtype=torch.bfloat16,
333
+ torch_empty_cache_steps=None,
334
+ torchdynamo=None,
335
+ tpu_metrics_debug=False,
336
+ tpu_num_cores=None,
337
+ train_dataloader_shuffle=True,
338
+ train_type=full,
339
+ trainable_parameters=[],
340
+ trainable_parameters_regex=None,
341
+ truncation_strategy=delete,
342
+ tuner_backend=peft,
343
+ use_chat_template=True,
344
+ use_cpu=False,
345
+ use_dora=False,
346
+ use_flash_ckpt=False,
347
+ use_galore=False,
348
+ use_hf=False,
349
+ use_ipex=False,
350
+ use_legacy_prediction_loop=False,
351
+ use_liger_kernel=False,
352
+ use_logits_to_keep=None,
353
+ use_mps_device=False,
354
+ use_rslora=False,
355
+ use_swift_lora=False,
356
+ val_dataset=[],
357
+ val_dataset_shuffle=False,
358
+ vera_d_initial=0.1,
359
+ vera_dropout=0.0,
360
+ vera_projection_prng_key=0,
361
+ vera_rank=256,
362
+ vit_gradient_checkpointing=None,
363
+ vit_lr=None,
364
+ warmup_ratio=0.05,
365
+ warmup_steps=0,
366
+ weight_decay=0.1,
367
+ zero_hpz_partition_size=None,
368
+ )
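As an aside (not part of the captured log): combined with the 8-process launch from the command line, the arguments above imply an effective global batch size; a quick sanity check, taking world_size = 8 from the `nproc_per_node 8` launch (also reported as `world_size = 8` further down):

```python
# Values copied from the TrainArguments dump above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
world_size = 8  # nproc_per_node from the launch command

# Effective number of samples contributing to each optimizer step.
global_batch_size = (per_device_train_batch_size
                     * world_size
                     * gradient_accumulation_steps)
print(global_batch_size)  # 64
```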
369
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
370
+ [2025-09-17 13:45:05,431] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
371
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
372
+ [2025-09-17 13:45:07,082] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
373
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
374
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
375
+ [2025-09-17 13:45:08,746] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
376
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
377
+ [INFO:modelscope] Target directory already exists, skipping creation.
378
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
379
+ [INFO:swift] model_kwargs: {'device_map': None}
380
+ [2025-09-17 13:45:10,258] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
381
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
382
+ [2025-09-17 13:45:11,990] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
383
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
384
+ [2025-09-17 13:45:13,654] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
385
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
386
+ [2025-09-17 13:45:15,393] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
387
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
388
+ [2025-09-17 13:45:17,053] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
389
+ [2025-09-17 13:45:17,199] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
390
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
399
+ "architectures": [
400
+ "Qwen2ForCausalLM"
401
+ ],
402
+ "attention_dropout": 0.0,
403
+ "bos_token_id": 151643,
404
+ "eos_token_id": 151645,
405
+ "hidden_act": "silu",
406
+ "hidden_size": 3584,
407
+ "initializer_range": 0.02,
408
+ "intermediate_size": 18944,
409
+ "layer_types": [
410
+ "full_attention",
411
+ "full_attention",
412
+ "full_attention",
413
+ "full_attention",
414
+ "full_attention",
415
+ "full_attention",
416
+ "full_attention",
417
+ "full_attention",
418
+ "full_attention",
419
+ "full_attention",
420
+ "full_attention",
421
+ "full_attention",
422
+ "full_attention",
423
+ "full_attention",
424
+ "full_attention",
425
+ "full_attention",
426
+ "full_attention",
427
+ "full_attention",
428
+ "full_attention",
429
+ "full_attention",
430
+ "full_attention",
431
+ "full_attention",
432
+ "full_attention",
433
+ "full_attention",
434
+ "full_attention",
435
+ "full_attention",
436
+ "full_attention",
437
+ "full_attention"
438
+ ],
439
+ "max_position_embeddings": 32768,
440
+ "max_window_layers": 28,
441
+ "model_type": "qwen2",
442
+ "num_attention_heads": 28,
443
+ "num_hidden_layers": 28,
444
+ "num_key_value_heads": 4,
445
+ "pad_token_id": 151643,
446
+ "rms_norm_eps": 1e-06,
447
+ "rope_scaling": null,
448
+ "rope_theta": 1000000.0,
449
+ "sliding_window": null,
450
+ "tie_word_embeddings": false,
451
+ "torch_dtype": "bfloat16",
452
+ "transformers_version": "4.55.4",
453
+ "use_cache": true,
454
+ "use_sliding_window": false,
455
+ "vocab_size": 152064
456
+ }
457
+ , task_type='causal_lm', num_labels=None)
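For context (not part of the log): the `num_elems = 7.62B` reported by DeepSpeed during partitioned init is consistent with the Qwen2Config above. A back-of-the-envelope count, assuming Qwen2's q/k/v projection biases and untied embeddings (`tie_word_embeddings: false`):

```python
# Shapes copied from the Qwen2Config printed above.
hidden, layers, ffn, vocab = 3584, 28, 18944, 152064
heads, kv_heads = 28, 4
head_dim = hidden // heads          # 128
kv = kv_heads * head_dim            # 512 (GQA key/value width)

attn = 2 * hidden * hidden + 2 * hidden * kv  # q, o (h x h); k, v (h x kv)
attn += hidden + 2 * kv                       # q/k/v biases (Qwen2 uses them)
mlp = 3 * hidden * ffn                        # gate, up, down projections
norms = 2 * hidden                            # two RMSNorms per layer
per_layer = attn + mlp + norms

# Untied input embedding + lm_head, plus the final RMSNorm.
total = layers * per_layer + 2 * vocab * hidden + hidden
print(f"{total / 1e9:.2f}B")  # matches DeepSpeed's num_elems = 7.62B
```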
458
+ [INFO:swift] model.generation_config: GenerationConfig {
459
+ "bos_token_id": 151643,
460
+ "eos_token_id": [
461
+ 151645,
462
+ 151643
463
+ ],
464
+ "max_new_tokens": 64,
465
+ "pad_token_id": 151643,
466
+ "repetition_penalty": 1.05
467
+ }
468
+
469
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
470
+ [INFO:swift] max_length: 16240
471
+ [INFO:swift] response_prefix: ''
472
+ [INFO:swift] agent_template: hermes
473
+ [INFO:swift] Start time of running main: 2025-09-17 13:45:19.304293
474
+ [INFO:swift] swift.__version__: 3.8.0.dev0
475
+ [rank0]: Traceback (most recent call last):
476
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
477
+ [rank0]: sft_main()
478
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
479
+ [rank0]: return SwiftSft(args).main()
480
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
481
+ [rank0]: result = self.run()
482
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
483
+ [rank0]: train_dataset, val_dataset = self._prepare_dataset()
484
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
485
+ [rank0]: train_dataset, val_dataset = self._get_dataset()
486
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
487
+ [rank0]: train_dataset, val_dataset = load_dataset(
488
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
489
+ [rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
490
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
491
+ [rank0]: dataset = DatasetLoader._load_repo_dataset(
492
+ [rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
493
+ [rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
494
+ [rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
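The ValueError above is raised by every rank independently: the failing `--dataset` entry is the one path listed without the `data/` directory component (compare it with the first entry in the `dataset=[...]` argument). A minimal pre-flight sketch (hypothetical helper, not part of ms-swift) that would surface such a path error once, before `torch.distributed.run` spawns workers:

```python
import os

def missing_datasets(paths):
    """Return the entries of a --dataset list that do not exist on disk."""
    return [p for p in paths if not os.path.isfile(p)]

# Illustration only: the second entry mirrors the failing path from the
# traceback above, which lacks the data/ directory component.
example = [
    "/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl",
    "/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl",
]
```

On the node that produced this log, only the second example entry would likely be returned, since the loader had already resolved the first dataset before failing.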
+ [rank1]-[rank7]: identical "The local path does not exist" ValueError tracebacks repeated on the remaining seven ranks (omitted)
+ [rank0]:[W917 13:45:22.970566457 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
636
+ W0917 13:45:23.570000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216883 closing signal SIGTERM
637
+ W0917 13:45:23.570000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216884 closing signal SIGTERM
638
+ W0917 13:45:23.573000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216885 closing signal SIGTERM
639
+ W0917 13:45:23.575000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216886 closing signal SIGTERM
640
+ W0917 13:45:23.575000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216888 closing signal SIGTERM
641
+ W0917 13:45:23.577000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216889 closing signal SIGTERM
642
+ W0917 13:45:23.577000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216890 closing signal SIGTERM
643
+ E0917 13:45:24.622000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 4 (pid: 216887) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
644
+ Traceback (most recent call last):
645
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
646
+ return _run_code(code, main_globals, None,
647
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
648
+ exec(code, run_globals)
649
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
650
+ main()
651
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
652
+ return f(*args, **kwargs)
653
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
654
+ run(args)
655
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
656
+ elastic_launch(
657
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
658
+ return launch_agent(self._config, self._entrypoint, list(args))
659
+ File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
660
+ raise ChildFailedError(
661
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
662
+ ============================================================
663
+ /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
664
+ ------------------------------------------------------------
665
+ Failures:
666
+ <NO_OTHER_FAILURES>
667
+ ------------------------------------------------------------
668
+ Root Cause (first observed failure):
669
+ [0]:
670
+ time : 2025-09-17_13:45:23
671
+ host : TENCENT64.site
672
+ rank : 4 (local_rank: 4)
673
+ exitcode : 1 (pid: 216887)
674
+ error_file: <N/A>
675
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
676
+ ============================================================
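Note: the failure above reduces to a single missing file; one `--dataset` entry pointed at the repo root instead of the intended location, and ms-swift raised the `ValueError` only after eight workers had spawned. A minimal pre-flight sketch (not part of ms-swift; `validate_dataset_paths` is a hypothetical helper) that would catch this before launching `torch.distributed.run`:

```python
import os

def validate_dataset_paths(paths):
    """Return the subset of local dataset paths that do not exist on disk."""
    return [p for p in paths if not os.path.exists(p)]

# The path that aborted the run above; a non-empty result means the
# launch would fail with the same ValueError on every rank.
missing = validate_dataset_paths(
    ["/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl"]
)
```

Running such a check in the launch script costs nothing and avoids a full distributed startup just to discover a typo in a path.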
log/20250917-13:46:26.log ADDED
@@ -0,0 +1,675 @@
+ run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
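For reference, the launch flags above pin down the effective global batch size; a quick arithmetic check (plain Python, not ms-swift code; flag names are quoted from the command line):

```python
# Effective global batch size implied by the launch flags above.
per_device_train_batch_size = 2   # --per_device_train_batch_size 2
nproc_per_node = 8                # --nproc_per_node 8
gradient_accumulation_steps = 4   # --gradient_accumulation_steps 4

global_batch_size = (per_device_train_batch_size
                     * nproc_per_node
                     * gradient_accumulation_steps)
print(global_batch_size)  # 64
```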
+
+ *****************************************
+ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ *****************************************
+ [INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
+ [INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:39,893] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:41,247] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:41,255] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:46:41,564] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] Setting args.lazy_tokenize: False
+ [2025-09-17 13:46:42,898] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:42,906] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
+ [2025-09-17 13:46:43,796] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:45,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:46:45,109] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:45,117] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:46:45,117] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:46,411] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:46,420] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:46:46,916] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:48,428] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:48,437] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:46:48,861] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:50,317] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:46:50,474] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:50,483] [INFO] [comm.py:821:init_distributed] cdb=None
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:51,692] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:51,701] [INFO] [comm.py:821:init_distributed] cdb=None
+ [2025-09-17 13:46:51,819] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2025-09-17 13:46:53,148] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
+ [2025-09-17 13:46:53,156] [INFO] [comm.py:821:init_distributed] cdb=None
+ [INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655
+ [INFO:swift] Global seed set to 42
+ [INFO:swift] args: TrainArguments(
+ _n_gpu=-1,
+ acc_strategy=token,
+ accelerator_config={'dispatch_batches': False},
+ adafactor=False,
+ adalora_beta1=0.85,
+ adalora_beta2=0.85,
+ adalora_deltaT=1,
+ adalora_init_r=12,
+ adalora_orth_reg_weight=0.5,
+ adalora_target_r=8,
+ adalora_tfinal=0,
+ adalora_tinit=0,
+ adam_beta1=0.9,
+ adam_beta2=0.95,
+ adam_epsilon=1e-08,
+ adapter_act=gelu,
+ adapter_length=128,
+ adapters=[],
+ add_version=True,
+ agent_template=None,
+ aligner_lr=None,
+ attn_impl=None,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=True,
+ batch_eval_metrics=False,
+ bf16=True,
+ bf16_full_eval=False,
+ bnb_4bit_compute_dtype=torch.bfloat16,
+ bnb_4bit_quant_storage=None,
+ bnb_4bit_quant_type=nf4,
+ bnb_4bit_use_double_quant=True,
+ boft_block_num=0,
+ boft_block_size=4,
+ boft_dropout=0.0,
+ boft_n_butterfly_factor=1,
+ cached_dataset=[],
+ channels=None,
+ check_model=True,
+ ckpt_dir=None,
+ columns={},
+ create_checkpoint_symlink=False,
+ custom_dataset_info=[],
+ custom_register_path=[],
+ data_seed=42,
+ dataloader_drop_last=False,
+ dataloader_num_workers=48,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
+ dataset_num_proc=100,
+ dataset_shuffle=True,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=18000000,
+ debug=None,
+ deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
+ deepspeed_autotp_size=None,
+ device_map=None,
+ disable_tqdm=None,
+ do_eval=False,
+ do_predict=False,
+ do_train=False,
+ download_mode=reuse_dataset_if_exists,
+ ds3_gather_for_generation=True,
+ early_stop_interval=None,
+ enable_dft_loss=False,
+ eval_accumulation_steps=None,
+ eval_dataset=[],
+ eval_dataset_args=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_generation_config=None,
+ eval_limit=None,
+ eval_on_start=False,
+ eval_steps=2000.0,
+ eval_strategy=epoch,
+ eval_use_evalscope=False,
+ eval_use_gather_object=False,
+ external_plugins=[],
+ extra_eval_args=None,
+ fourier_n_frequency=2000,
+ fourier_scaling=300.0,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ freeze_aligner=False,
+ freeze_llm=False,
+ freeze_parameters=[],
+ freeze_parameters_ratio=0.0,
+ freeze_parameters_regex=None,
+ freeze_vit=True,
+ fsdp=,
+ fsdp_config=None,
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ galore_cos_threshold=0.4,
+ galore_gamma_proj=2,
+ galore_optim_per_parameter=False,
+ galore_proj_bits=4,
+ galore_proj_group_size=256,
+ galore_proj_quant=False,
+ galore_proj_type=std,
+ galore_quantization=False,
+ galore_queue_size=5,
+ galore_rank=128,
+ galore_scale=1.0,
+ galore_target_modules=None,
+ galore_update_proj_gap=50,
+ galore_with_embedding=False,
+ generation_config=None,
+ generation_max_length=None,
+ generation_num_beams=None,
+ gradient_accumulation_steps=4,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs=None,
+ greater_is_better=False,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hqq_axis=None,
+ hub_always_push=False,
+ hub_model_id=None,
+ hub_private_repo=None,
+ hub_revision=None,
+ hub_strategy=every_save,
+ hub_token=<HUB_TOKEN>,
+ ignore_args_error=False,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ init_strategy=None,
+ init_weights=True,
+ interleave_prob=None,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ lazy_tokenize=False,
+ learning_rate=5e-06,
+ length_column_name=length,
+ liger_kernel_config=None,
+ lisa_activated_layers=0,
+ lisa_step_interval=20,
+ llamapro_num_groups=None,
+ llamapro_num_new_blocks=4,
+ load_args=False,
+ load_best_model_at_end=False,
+ load_data_args=False,
+ load_from_cache_file=True,
+ local_rank=0,
+ local_repo_path=None,
+ log_level=passive,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs,
+ logging_first_step=True,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=steps,
+ logprobs=False,
+ lora_alpha=32,
+ lora_bias=none,
+ lora_dropout=0.05,
+ lora_dtype=None,
+ lora_ga_batch_size=2,
+ lora_ga_direction=ArB2r,
+ lora_ga_iters=2,
+ lora_ga_max_length=1024,
+ lora_ga_scale=stable,
+ lora_ga_stable_gamma=16,
+ lora_modules=[],
+ lora_rank=8,
+ lorap_lr_ratio=None,
+ loss_scale=default,
+ loss_type=None,
+ lr_scheduler_kwargs=None,
+ lr_scheduler_type=cosine,
+ max_epochs=None,
+ max_grad_norm=1.0,
+ max_length=16240,
+ max_memory={},
+ max_model_len=None,
+ max_new_tokens=64,
+ max_pixels=None,
+ max_steps=-1,
+ metric=None,
+ metric_for_best_model=loss,
+ model=Qwen/Qwen2.5-7B-Instruct,
+ model_author=None,
+ model_kwargs={},
+ model_name=None,
+ model_revision=None,
+ model_type=qwen2_5,
+ modules_to_save=[],
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ new_special_tokens=[],
+ no_cuda=False,
+ norm_bbox=None,
+ num_beams=1,
+ num_labels=None,
+ num_train_epochs=2.0,
+ optim=adamw_torch_fused,
+ optim_args=None,
+ optim_target_modules=None,
+ optimizer=None,
+ output_dir=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655,
+ overwrite_output_dir=False,
+ packing=False,
+ packing_length=None,
+ padding_free=False,
+ padding_side=right,
+ past_index=-1,
+ per_device_eval_batch_size=1,
+ per_device_train_batch_size=2,
+ predict_with_generate=False,
+ prediction_loss_only=False,
+ problem_type=None,
+ push_to_hub=False,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ quant_bits=None,
+ quant_method=None,
+ ray_scope=last,
+ reft_args=None,
+ reft_intervention_type=LoreftIntervention,
+ reft_layer_key=None,
+ reft_layers=None,
+ reft_rank=4,
+ remove_unused_columns=True,
+ repetition_penalty=None,
+ report_to=['tensorboard'],
+ response_prefix=None,
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ resume_only_model=False,
+ rope_scaling=None,
+ router_aux_loss_coef=0.0,
+ run_name=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=epoch,
+ save_total_limit=None,
+ seed=42,
+ sequence_parallel_size=1,
+ shuffle_buffer_size=1000,
+ skip_memory_metrics=True,
+ sortish_sampler=False,
+ split_dataset_ratio=0.001,
+ stop_words=[],
+ stopping_strategy=first_exhausted,
+ stream=False,
+ streaming=False,
+ strict=False,
+ swanlab_exp_name=None,
+ swanlab_lark_secret=None,
+ swanlab_lark_webhook_url=None,
+ swanlab_mode=cloud,
+ swanlab_project=None,
+ swanlab_token=<SWANLAB_TOKEN>,
+ swanlab_workspace=None,
+ system=None,
+ target_modules=['all-linear'],
+ target_regex=None,
+ task_type=causal_lm,
+ temperature=0.0,
+ template=qwen2_5,
+ template_backend=swift,
+ tf32=None,
+ top_k=None,
+ top_logprobs=None,
+ top_p=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torch_dtype=torch.bfloat16,
+ torch_empty_cache_steps=None,
+ torchdynamo=None,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ train_dataloader_shuffle=True,
+ train_type=full,
+ trainable_parameters=[],
+ trainable_parameters_regex=None,
+ truncation_strategy=delete,
+ tuner_backend=peft,
+ use_chat_template=True,
+ use_cpu=False,
+ use_dora=False,
+ use_flash_ckpt=False,
+ use_galore=False,
+ use_hf=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_liger_kernel=False,
+ use_logits_to_keep=None,
+ use_mps_device=False,
+ use_rslora=False,
+ use_swift_lora=False,
+ val_dataset=[],
+ val_dataset_shuffle=False,
+ vera_d_initial=0.1,
+ vera_dropout=0.0,
+ vera_projection_prng_key=0,
+ vera_rank=256,
+ vit_gradient_checkpointing=None,
+ vit_lr=None,
+ warmup_ratio=0.05,
+ warmup_steps=0,
+ weight_decay=0.1,
+ zero_hpz_partition_size=None,
+ )
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:46:58,379] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [INFO:modelscope] Target directory already exists, skipping creation.
+ [INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
+ [INFO:swift] model_kwargs: {'device_map': None}
+ [2025-09-17 13:46:59,975] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:01,668] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:03,273] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:04,917] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:06,546] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:08,173] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
+ [2025-09-17 13:47:09,832] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
+ [2025-09-17 13:47:09,976] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
+
+
+
+
+
+
+
+
+ [INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "pad_token_id": 151643,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.55.4",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ }
+ , task_type='causal_lm', num_labels=None)
+ [INFO:swift] model.generation_config: GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "max_new_tokens": 64,
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05
+ }
+
+ [INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
+ [INFO:swift] max_length: 16240
+ [INFO:swift] response_prefix: ''
+ [INFO:swift] agent_template: hermes
+ [INFO:swift] Start time of running main: 2025-09-17 13:47:12.047609
+ [INFO:swift] swift.__version__: 3.8.0.dev0
+ Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+
+
+ Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
+
+
+ [INFO:swift] train_dataset: Dataset({
+ features: ['messages'],
+ num_rows: 23973
+ })
+ [INFO:swift] val_dataset: Dataset({
+ features: ['messages'],
+ num_rows: 21
+ })
+ [INFO:swift] The split dataset from the training set will be saved at: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/val_dataset.jsonl.
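Given the 23,973 training rows logged above and the global batch size of 2 × 8 × 4 = 64 implied by the launch flags, the optimizer-step and warmup counts work out as below (arithmetic sketch only; the trainer's exact rounding and drop-last behavior may differ):

```python
import math

num_rows = 23973          # train_dataset num_rows logged above
global_batch = 2 * 8 * 4  # per-device batch x world size x grad accumulation
epochs = 2                # --num_train_epochs 2
warmup_ratio = 0.05       # --warmup_ratio 0.05

steps_per_epoch = math.ceil(num_rows / global_batch)  # 375
total_steps = steps_per_epoch * epochs                # 750
warmup_steps = int(total_steps * warmup_ratio)        # 37
```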
+
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
+ [INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 264, 6929, 16230, 17847, 6188, 311, 9026, 3019, 14319, 29208, 6929, 7525, 29720, 323, 23638, 311, 4583, 279, 1196, 594, 3383, 13, 1446, 525, 3897, 448, 3151, 9079, 323, 44610, 13904, 1995, 11, 323, 498, 1184, 311, 2550, 13382, 6168, 311, 22054, 279, 1196, 594, 3383, 382, 8420, 594, 279, 1995, 498, 3278, 614, 510, 785, 1196, 594, 16538, 25, 1096, 374, 279, 3383, 498, 2299, 4460, 311, 4583, 624, 785, 1482, 3482, 2150, 594, 39700, 4916, 25, 1096, 374, 264, 43799, 13042, 315, 279, 44610, 11, 8241, 1376, 1995, 624, 785, 1787, 22398, 25, 4220, 525, 279, 22398, 498, 614, 1787, 624, 785, 3681, 6168, 25, 2619, 525, 279, 6168, 498, 1101, 10660, 13, 1084, 1231, 387, 10950, 311, 3754, 697, 5098, 382, 785, 6168, 498, 646, 2736, 4399, 1119, 3807, 11059, 1447, 2665, 16730, 26722, 510, 63, 3678, 508, 307, 60, 508, 1796, 60, 44622, 1096, 1917, 27749, 389, 458, 2392, 448, 264, 3151, 877, 389, 279, 44610, 624, 63, 1313, 508, 307, 60, 508, 1796, 60, 508, 1873, 37480, 19844, 28, 15, 91, 16, 60, 44622, 5443, 419, 311, 943, 279, 2213, 1119, 279, 2070, 448, 877, 13, 3216, 1638, 11, 279, 1591, 6269, 3014, 1376, 374, 17320, 1283, 19496, 7241, 3493, 37480, 19844, 374, 738, 311, 220, 15, 624, 63, 17583, 508, 307, 60, 508, 1796, 60, 44622, 85569, 916, 458, 2392, 448, 877, 624, 63, 1873, 508, 792, 34454, 60, 44622, 220, 4467, 23156, 279, 25352, 315, 264, 1376, 10601, 389, 279, 13625, 320, 68, 1302, 2572, 37014, 98267, 4292, 63, 12605, 508, 2923, 91, 454, 60, 44622, 22392, 279, 2150, 705, 476, 1495, 382, 8582, 9551, 26722, 510, 63, 931, 17344, 44622, 5264, 264, 501, 11, 4287, 6929, 5651, 624, 63, 6192, 47492, 508, 6192, 3560, 60, 44622, 15586, 279, 6929, 594, 5244, 311, 264, 3151, 5651, 1667, 1181, 1922, 624, 63, 5552, 17344, 44622, 13032, 279, 5023, 4541, 5651, 382, 3144, 17980, 26722, 510, 63, 28535, 508, 1085, 60, 44622, 81739, 311, 264, 3151, 5548, 624, 63, 3346, 3895, 44622, 81739, 311, 279, 8597, 19334, 2150, 624, 63, 3346, 32121, 
44622, 81739, 311, 279, 1790, 2150, 320, 333, 264, 3681, 364, 3346, 3895, 6, 1917, 572, 10660, 3593, 33190, 5586, 510, 63, 9495, 508, 9217, 60, 44622, 25226, 419, 1917, 979, 498, 4411, 279, 3383, 374, 4583, 13, 1416, 279, 16538, 374, 311, 1477, 264, 1467, 5980, 4226, 11, 3410, 279, 4226, 304, 279, 31642, 13, 1416, 498, 4411, 279, 3383, 374, 11997, 311, 4583, 11, 3410, 279, 4226, 438, 1591, 45, 10360, 3014, 304, 279, 31642, 382, 1249, 387, 6849, 11, 432, 374, 1602, 2989, 311, 1795, 279, 2701, 5601, 510, 16, 13, 1446, 1265, 1172, 4265, 458, 1917, 429, 374, 2697, 2661, 279, 1482, 21930, 624, 17, 13, 1446, 1265, 1172, 4265, 825, 1917, 518, 264, 882, 624, 18, 13, 1446, 1265, 1795, 279, 10295, 311, 2874, 3019, 553, 3019, 323, 1221, 4265, 279, 1790, 1917, 624, 19, 13, 1446, 1265, 8300, 311, 13656, 6168, 979, 4265, 458, 1917, 323, 1430, 537, 311, 1281, 58077, 6168, 198, 20, 13, 2009, 32711, 1969, 387, 4766, 30586, 26865, 1472, 26865, 43626, 9492, 11, 323, 1052, 1969, 387, 902, 2550, 1573, 30586, 26865, 1472, 26865, 29, 18639, 21, 13, 4636, 30586, 26865, 1472, 26865, 29, 7808, 1172, 279, 1917, 1265, 387, 7907, 304, 279, 4396, 3561, 11, 43810, 304, 2038, 69155, 13, 1752, 3110, 510, 256, 366, 26865, 41993, 3137, 5868, 9760, 311, 847, 5795, 13, 9189, 287, 432, 1265, 1896, 752, 311, 279, 1790, 3019, 3918, 26865, 397, 256, 54275, 3678, 508, 307, 60, 508, 1796, 60, 13874, 3989, 22, 13, 25226, 279, 2936, 1917, 979, 498, 1744, 498, 614, 16994, 279, 16538, 13, 4320, 1405, 6923, 4113, 1283, 2936, 624, 23, 13, 23240, 3561, 6168, 12440, 25, 715, 73594, 5631, 508, 13786, 60, 13874, 3989, 2461, 3110, 11, 421, 15039, 369, 1591, 61907, 2802, 41612, 304, 279, 2274, 3014, 304, 264, 2711, 2070, 448, 3034, 1565, 17, 16, 7808, 12440, 3561, 432, 438, 510, 73594, 1313, 508, 17, 16, 60, 508, 61907, 2802, 41612, 304, 279, 2274, 60, 508, 16, 60, 13874, 3989, 52116, 15114, 19856, 429, 51044, 38929, 2163, 5029, 476, 24064, 2750, 624, 151645, 198, 151644, 872, 271, 78306, 25, 879, 702, 279, 1429, 3842, 
2256, 19212, 24544, 198, 37763, 367, 25, 220, 508, 16, 20, 21, 24, 60, 18854, 5981, 8726, 364, 19284, 20288, 10058, 6, 10735, 25, 3007, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 198, 197, 58, 17, 23, 17, 24, 60, 73999, 330, 5890, 364, 54, 14939, 14913, 2567, 25, 3557, 198, 197, 58, 17, 23, 18, 16, 60, 31300, 3355, 10067, 25, 895, 198, 197, 58, 17, 23, 18, 23, 60, 2656, 364, 10850, 311, 10565, 2150, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 5894, 197, 197, 58, 17, 23, 18, 24, 60, 3137, 364, 145574, 1248, 197, 58, 17, 23, 19, 15, 60, 2656, 330, 10850, 311, 279, 1887, 2150, 315, 364, 54, 14939, 14913, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5894, 197, 197, 58, 17, 23, 19, 16, 60, 3137, 364, 54, 14939, 1248, 197, 58, 17, 23, 19, 17, 60, 2656, 364, 10850, 311, 264, 26618, 4091, 2150, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 79442, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 198, 197, 197, 58, 17, 23, 19, 18, 60, 3137, 364, 147724, 1248, 197, 58, 17, 23, 20, 15, 60, 14496, 364, 19284, 20288, 10058, 6, 702, 24381, 25, 5022, 198, 197, 58, 16, 21, 21, 24, 60, 23105, 1178, 364, 785, 7297, 20288, 10058, 374, 458, 10084, 304, 279, 7885, 197, 58, 17, 23, 20, 20, 60, 2656, 364, 30812, 20761, 8953, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 20290, 1663, 1400, 1905, 3959, 62, 70107, 198, 197, 58, 17, 23, 20, 21, 60, 2656, 364, 19284, 20288, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 198, 197, 58, 16, 21, 22, 20, 60, 23105, 1178, 6256, 8704, 1172, 825, 7885, 197, 58, 17, 23, 20, 22, 60, 2656, 364, 53, 1701, 62818, 36389, 55104, 6, 2515, 25, 3703, 1110, 83, 
7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 27233, 1701, 2351, 2855, 36389, 1139, 57075, 198, 197, 58, 16, 21, 22, 22, 60, 23105, 1178, 364, 374, 21328, 311, 279, 2083, 320, 78147, 8, 5086, 11, 279, 7297, 20288, 10058, 6081, 264, 6530, 480, 296, 75284, 369, 279, 5042, 4217, 323, 2083, 3613, 311, 2506, 369, 5577, 311, 7735, 551, 862, 12560, 23421, 197, 58, 17, 23, 20, 24, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 16, 198, 197, 58, 16, 21, 22, 23, 60, 23105, 1178, 364, 2619, 525, 1083, 24544, 3897, 311, 279, 38280, 5239, 2083, 315, 279, 7297, 20288, 23421, 197, 58, 17, 23, 21, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 17, 198, 197, 58, 17, 22, 24, 16, 60, 2168, 3355, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 38151, 11374, 3466, 1668, 9605, 6859, 43, 34683, 4819, 6411, 79, 198, 197, 58, 16, 21, 22, 24, 60, 23105, 1178, 364, 785, 50455, 7885, 197, 58, 17, 23, 21, 23, 60, 2656, 364, 19284, 20288, 29881, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 6859, 43, 198, 197, 58, 16, 21, 23, 16, 60, 23105, 1178, 364, 10058, 1248, 197, 58, 17, 22, 24, 17, 60, 2168, 3355, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 38151, 14, 22493, 18522, 2142, 1020, 81, 819, 4819, 6411, 79, 198, 197, 58, 17, 23, 22, 17, 60, 2656, 364, 40344, 576, 2142, 1020, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 
17, 12, 15, 20, 10360, 31701, 4644, 62, 785, 2142, 1020, 198, 197, 58, 17, 23, 22, 18, 60, 2656, 364, 19284, 20288, 52594, 5543, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 6859, 53, 5543, 198, 197, 58, 16, 21, 23, 20, 60, 23105, 1178, 364, 18621, 10058, 320, 1291, 39567, 197, 58, 16, 21, 23, 21, 60, 23105, 1178, 364, 49, 819, 525, 1083, 21328, 311, 3613, 315, 279, 2083, 879, 14816, 279, 63536, 476, 47205, 21553, 23421, 197, 58, 17, 23, 22, 21, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 18, 198, 197, 58, 17, 23, 22, 24, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 19, 198, 197, 58, 17, 23, 23, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 20, 198, 197, 58, 16, 21, 23, 22, 60, 23105, 1178, 364, 576, 12588, 1083, 5707, 55371, 2291, 311, 678, 4217, 438, 1293, 438, 807, 614, 7391, 518, 3245, 2326, 3868, 389, 862, 2083, 748, 4541, 476, 31799, 1140, 26, 279, 19380, 12037, 3220, 374, 76171, 20030, 2878, 264, 2083, 4221, 42706, 11, 59666, 11, 323, 15532, 4217, 23421, 197, 58, 17, 23, 23, 20, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 21, 198, 197, 58, 17, 23, 23, 22, 60, 1874, 11776, 197, 197, 58, 17, 23, 24, 17, 60, 77254, 51942, 364, 7799, 6, 17183, 25, 3007, 198, 298, 197, 58, 17, 23, 24, 18, 60, 14496, 364, 7799, 1248, 197, 197, 58, 
16, 21, 24, 15, 60, 23105, 1178, 364, 9485, 24544, 525, 11136, 1865, 315, 7885, 197, 197, 58, 17, 23, 24, 20, 60, 2656, 364, 27869, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 15792, 813, 198, 197, 197, 58, 17, 23, 24, 21, 60, 2656, 364, 24847, 6623, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 19382, 960, 54696, 198, 197, 197, 58, 16, 21, 24, 19, 60, 23105, 1178, 364, 448, 7885, 197, 197, 58, 17, 23, 24, 22, 60, 2656, 364, 88576, 82, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 14953, 18479, 82, 198, 197, 197, 58, 16, 21, 24, 21, 60, 23105, 1178, 6256, 2379, 5990, 2924, 279, 2083, 829, 11, 2083, 12426, 11, 279, 7885, 197, 197, 58, 17, 23, 24, 23, 60, 2656, 364, 27710, 330, 10134, 28808, 20584, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 10270, 3035, 33609, 15774, 643, 3394, 11751, 13488, 1243, 16068, 62, 22372, 62, 23256, 8378, 920, 276, 2584, 2, 10253, 3575, 16068, 66696, 18695, 17, 17, 14615, 4138, 9132, 4, 17, 17, 198, 197, 197, 58, 16, 21, 24, 23, 60, 23105, 1178, 6614, 323, 279, 7297, 20288, 1372, 320, 42966, 16317, 304, 7885, 197, 197, 58, 17, 23, 24, 24, 60, 2656, 364, 60980, 7857, 1127, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 19382, 6908, 4273, 3253, 198, 197, 197, 58, 16, 22, 15, 15, 60, 23105, 1178, 49884, 8999, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 7885, 197, 197, 58, 17, 24, 15, 15, 60, 2656, 364, 53, 1701, 62818, 36389, 55104, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 27233, 1701, 2351, 2855, 36389, 1139, 57075, 198, 197, 197, 58, 16, 22, 15, 
17, 60, 23105, 1178, 364, 476, 264, 7885, 197, 197, 58, 17, 22, 23, 20, 60, 2656, 364, 84336, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 10360, 15717, 75759, 3959, 55808, 198, 197, 197, 58, 16, 22, 15, 19, 60, 23105, 1178, 6614, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 23421, 197, 197, 58, 17, 24, 15, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 22, 198, 197, 197, 58, 17, 24, 15, 20, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 23, 198, 197, 197, 58, 16, 22, 15, 21, 60, 23105, 1178, 364, 7496, 24544, 525, 27548, 553, 16035, 93350, 2813, 7885, 197, 197, 58, 17, 24, 15, 22, 60, 2656, 364, 41, 535, 724, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 31701, 535, 724, 198, 197, 197, 58, 16, 22, 15, 23, 60, 23105, 1178, 6256, 576, 24544, 3545, 525, 14279, 10449, 304, 458, 25777, 3745, 323, 5144, 3037, 1142, 23421, 197, 197, 58, 16, 22, 15, 24, 60, 23105, 1178, 364, 785, 10987, 2083, 646, 11136, 3042, 894, 1372, 315, 24544, 311, 420, 635, 423, 807, 5157, 11, 2670, 5990, 11, 714, 537, 7199, 311, 25, 4217, 320, 3028, 21445, 11, 31799, 21445, 11, 476, 7885, 197, 197, 58, 17, 24, 15, 24, 60, 2656, 364, 258, 79391, 20499, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 14, 641, 79391, 89591, 2019, 198, 197, 197, 58, 16, 22, 16, 16, 60, 23105, 1178, 89384, 23956, 11, 50091, 11, 28409, 11, 16849, 11, 323, 4586, 5570, 13, 4329, 7263, 614, 2661, 24544, 311, 4741, 4217, 323, 23956, 429, 
1033, 389, 279, 2083, 518, 1045, 1459, 2337, 279, 3200, 11, 8818, 537, 3432, 1012, 389, 279, 10987, 21445, 369, 279, 7297, 20288, 5086, 23421, 39, 35586, 21866, 25, 5872, 1313, 508, 18, 18, 15, 60, 508, 19284, 20288, 24544, 60, 508, 16, 921, 3678, 508, 16, 17, 20, 17, 60, 508, 19284, 20288, 10058, 921, 151645, 198, 151644, 77091, 198, 13708, 766, 1784, 26865, 16357, 1482, 2150, 374, 911, 7297, 20288, 24544, 11, 714, 1558, 537, 3602, 3410, 279, 4226, 311, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 13, 2619, 374, 264, 3772, 44291, 330, 8441, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 55132, 62818, 36389, 55104, 476, 264, 8964, 11, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 1189, 1096, 30643, 518, 3842, 2781, 32897, 11, 438, 16087, 311, 2083, 58184, 13, 2014, 1477, 700, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 11, 358, 1265, 1401, 369, 264, 1140, 476, 6286, 315, 7775, 11, 10767, 553, 37976, 4623, 1495, 311, 1490, 421, 264, 1140, 476, 1965, 315, 3255, 2832, 32120, 7952, 3918, 26865, 397, 522, 26865, 397, 73594, 12605, 508, 2923, 60, 73594, 151645]
+ [INFO:swift] [INPUT] <|im_start|>system
+ You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.
+
+ Here's the information you'll have:
+ The user's objective: This is the task you're trying to complete.
+ The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
+ The open tabs: These are the tabs you have open.
+ The previous actions: These are the actions you just performed. It may be helpful to track your progress.
+
+ The actions you can perform fall into several categories:
+
+ Page Operation Actions:
+ `click [id] [content]`: This action clicks on an element with a specific id on the webpage.
+ `type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0.
+ `hover [id] [content]`: Hover over an element with id.
+ `press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
+ `scroll [down|up]`: Scroll the page up or down.
+
+ Tab Management Actions:
+ `new_tab`: Open a new, empty browser tab.
+ `tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
+ `close_tab`: Close the currently active tab.
+
+ URL Navigation Actions:
+ `goto [url]`: Navigate to a specific URL.
+ `go_back`: Navigate to the previously viewed page.
+ `go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
+
+ Completion Action:
+ `stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket.
+
+ To be successful, it is very important to follow the following rules:
+ 1. You should only issue an action that is valid given the current observation.
+ 2. You should only issue one action at a time.
+ 3. You should follow the examples to reason step by step and then issue the next action.
+ 4. You should refer to historical actions when issuing an action and try not to make repetitive actions.
+ 5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.
+ 6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:
+ <think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>
+ ```click [id] [content]```
+ 7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.
+ 8. Always format actions correctly:
+ ```command [parameters]```
+ For example, if searching for "death row inmates in the US" in a search field with ID `21`, correctly format it as:
+ ```type [21] [death row inmates in the US] [1]```
+ Avoid incorrect formats that omit brackets around parameters or numeric values.
+ <|im_end|>
+ <|im_start|>user
+
+ Objective: who has the most individual super bowl rings
+ Observation: [1569] RootWebArea 'Super Bowl ring' focused: True url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring
+ [2829] textbox "Search 'Wikipedia'" required: False
+ [2831] checkbox '' checked: false
+ [2838] link 'Go to welcome page' url: https://tigerai.ca/
+ [2839] button '🏠'
+ [2840] link "Go to the main page of 'Wikipedia'" url: https://tigerai.ca/wikipedia_en_all_maxi_2022-05/
+ [2841] button 'Wikipedia'
+ [2842] link 'Go to a randomly selected page' url: https://tigerai.ca/random?content=wikipedia_en_all_maxi_2022-05
+ [2843] button '🎲'
+ [2850] heading 'Super Bowl ring' hasPopup: menu
+ [1669] StaticText 'The Super Bowl ring is an award in the '
+ [2855] link 'National Football League' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/National_Football_League
+ [2856] link 'Super Bowl' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl
+ [1675] StaticText '. Since only one '
+ [2857] link 'Vince Lombardi Trophy' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Vince_Lombardi_Trophy
+ [1677] StaticText ' is awarded to the team (ownership) itself, the Super Bowl ring offers a collectable memento for the actual players and team members to keep for themselves to symbolize their victory.'
+ [2859] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-1
+ [1678] StaticText ' There are also rings provided to the runners-up team of the Super Bowl.'
+ [2862] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-2
+ [2791] image '' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/I/Super_Bowl_XL_ring.jpg.webp
+ [1679] StaticText 'The Steelers '
+ [2868] link 'Super Bowl XL' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_XL
+ [1681] StaticText ' ring'
+ [2792] image '' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/I/Joetheismannrings.jpg.webp
+ [2872] link 'Joe Theismann' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Joe_Theismann
+ [2873] link 'Super Bowl XVII' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_XVII
+ [1685] StaticText ' Championship ring (right)'
+ [1686] StaticText 'Rings are also awarded to members of the team who wins the AFC or NFC championship.'
+ [2876] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-3
+ [2879] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-4
+ [2882] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-5
+ [1687] StaticText ' The NFL also provides postseason pay to all players as long as they have spent at least three games on their team’s active or inactive list; the playoff bonus money is egalitarian within a team among starters, backups, and injured players.'
+ [2885] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-6
+ [2887] group ''
+ [2892] DisclosureTriangle 'Details' expanded: True
+ [2893] heading 'Details'
+ [1690] StaticText 'These rings are typically made of '
+ [2895] link 'yellow' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Gold
+ [2896] link 'rose gold' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Rose_gold
+ [1694] StaticText ' with '
+ [2897] link 'diamonds' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Diamonds
+ [1696] StaticText '. They usually include the team name, team logo, the '
+ [2898] link 'phrase "World Champions"' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Major_professional_sports_leagues_in_the_United_States_and_Canada#Use_of_the_phrase_%22world_champions%22
+ [1698] StaticText ', and the Super Bowl number (usually indicated in '
+ [2899] link 'Roman numerals' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Roman_numeral
+ [1700] StaticText '). Many rings feature diamonds in the shape of the '
+ [2900] link 'Vince Lombardi Trophy' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Vince_Lombardi_Trophy
+ [1702] StaticText ' or a '
+ [2785] link 'football' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/American_football_ball
+ [1704] StaticText ', to illustrate the number of Super Bowls that the franchise has won.'
+ [2902] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-7
+ [2905] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-8
+ [1706] StaticText ' Most rings are manufactured by memorabilia company '
+ [2907] link 'Jostens' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Jostens
+ [1708] StaticText '. The rings often are normally presented in an elegant box and/or display case.'
+ [1709] StaticText 'The winning team can typically present any number of rings to whomever they choose, including usually, but not limited to: players (active roster, inactive roster, or '
+ [2909] link 'injured reserve' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Injured_reserve_list
+ [1711] StaticText '), coaches, trainers, executives, personnel, and general staff. Some teams have given rings to former players and coaches that were on the team at some point during the season, despite not having been on the winning roster for the Super Bowl itself.'
+ HISTORY_ACTION:
+ type [330] [Super Bowl rings] [1]
+ click [1252] [Super Bowl ring]
+ <|im_end|>
+ <|im_start|>assistant
+ <think><think>The current page is about Super Bowl rings, but does not yet provide the answer to who has the most individual Super Bowl rings. There is a section mentioning "Many rings feature diamonds in the shape of the Vince Lombardi Trophy or a football, to illustrate the number of Super Bowls that the franchise has won." This hints at individual player achievements, as opposed to team championships. To find out who has the most individual Super Bowl rings, I should look for a list or mention of individuals, possibly by scrolling further down to see if a list or table of record-holders appears.</think>
+ </think>
+ ```scroll [down]```<|im_end|>
+ [INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 1784, 26865, 16357, 1482, 2150, 374, 911, 7297, 20288, 24544, 11, 714, 1558, 537, 3602, 3410, 279, 4226, 311, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 13, 2619, 374, 264, 3772, 44291, 330, 8441, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 55132, 62818, 36389, 55104, 476, 264, 8964, 11, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 1189, 1096, 30643, 518, 3842, 2781, 32897, 11, 438, 16087, 311, 2083, 58184, 13, 2014, 1477, 700, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 11, 358, 1265, 1401, 369, 264, 1140, 476, 6286, 315, 7775, 11, 10767, 553, 37976, 4623, 1495, 311, 1490, 421, 264, 1140, 476, 1965, 315, 3255, 2832, 32120, 7952, 3918, 26865, 397, 522, 26865, 397, 73594, 12605, 508, 2923, 60, 73594, 151645]
618
+ [INFO:swift] [LABELS] [-100 * 2687]<think><think>The current page is about Super Bowl rings, but does not yet provide the answer to who has the most individual Super Bowl rings. There is a section mentioning "Many rings feature diamonds in the shape of the Vince Lombardi Trophy or a football, to illustrate the number of Super Bowls that the franchise has won." This hints at individual player achievements, as opposed to team championships. To find out who has the most individual Super Bowl rings, I should look for a list or mention of individuals, possibly by scrolling further down to see if a list or table of record-holders appears.</think>
619
+ </think>
620
+ ```scroll [down]```<|im_end|>
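The `[LABELS] [-100 * 2687]` prefix in the dump above reflects standard prompt masking: positions labeled `-100` are excluded from the cross-entropy loss (the default `ignore_index` in PyTorch), so only the assistant response tokens contribute. A minimal illustrative sketch of that masking semantics (not ms-swift's actual code; `masked_token_loss` is a hypothetical helper operating on per-token log-probabilities):

```python
def masked_token_loss(token_logprobs, labels, ignore_index=-100):
    """Average negative log-likelihood over non-masked positions only.

    token_logprobs: log-prob assigned to the target token at each position.
    labels: target ids, with ignore_index marking prompt tokens to skip.
    """
    kept = [lp for lp, y in zip(token_logprobs, labels) if y != ignore_index]
    if not kept:
        return 0.0  # every position masked -> no loss contribution
    return -sum(kept) / len(kept)
```

With labels `[-100, 5, 7]` only the last two positions are averaged, mirroring how the 2687 masked prompt tokens above are skipped.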
621
+ [INFO:swift] Dataset Token Length: 2803.201644±911.025599, min=828.000000, max=13246.000000, size=23973
622
+ [INFO:swift] Dataset Token Length: 3108.476190±840.159544, min=1605.000000, max=4793.000000, size=21
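The two `Dataset Token Length` lines report mean±std, min, max, and size for the train and eval splits. An illustrative sketch of how such summary statistics can be computed from a list of per-sample token lengths (assumption: whether swift uses population or sample standard deviation is not shown in the log; population std is used here):

```python
import statistics

def length_stats(lengths):
    """Return (mean, population std, min, max) for a list of token lengths."""
    return (statistics.mean(lengths), statistics.pstdev(lengths),
            min(lengths), max(lengths))
```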
623
+ [INFO:swift] The TrainArguments will be saved in: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/args.json
624
+ [INFO:swift] model: Qwen2ForCausalLM(
625
+ (model): Qwen2Model(
626
+ (embed_tokens): Embedding(152064, 3584)
627
+ (layers): ModuleList(
628
+ (0-27): 28 x Qwen2DecoderLayer(
629
+ (self_attn): Qwen2Attention(
630
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
631
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
632
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
633
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
634
+ )
635
+ (mlp): Qwen2MLP(
636
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
637
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
638
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
639
+ (act_fn): SiLU()
640
+ )
641
+ (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
642
+ (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
643
+ )
644
+ )
645
+ (norm): Qwen2RMSNorm((0,), eps=1e-06)
646
+ (rotary_emb): Qwen2RotaryEmbedding()
647
+ )
648
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
649
+ )
650
+ [INFO:swift] model_parameter_info: Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.
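The `model_parameter_info` line shows 7615.6165M parameters, all trainable, consistent with `--train_type full` and `--freeze_llm false`. An illustrative sketch of producing that style of summary from `(numel, requires_grad)` pairs (`format_param_info` is a hypothetical helper, not ms-swift's implementation):

```python
def format_param_info(params):
    """params: iterable of (numel, requires_grad) per parameter tensor."""
    total = sum(n for n, _ in params)
    trainable = sum(n for n, rg in params if rg)
    return (f"{total / 1e6:.4f}M Params "
            f"({trainable / 1e6:.4f}M Trainable [{100 * trainable / total:.4f}%])")
```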
651
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
652
+ super().__init__(
653
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
654
+ super().__init__(
655
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
656
+ super().__init__(
657
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
658
+ super().__init__(
659
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
660
+ super().__init__(
661
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
662
+ super().__init__(
663
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
664
+ super().__init__(
665
+ /group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
666
+ super().__init__(
667
+ Detected kernel version 5.4.241, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
668
+ [INFO:swift] use_reentrant: True
669
+ [INFO:swift] The logging file will be saved in: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/logging.jsonl
670
+ Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
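The warning above notes that DeepSpeed's `gradient_accumulation_steps=4` overrides the plugin's value of 1. For context, an illustrative sketch (not ms-swift code) of the resulting effective global batch size for this run, per device batch 2 × accumulation 4 × 8 GPUs:

```python
def effective_batch_size(per_device, grad_accum, world_size):
    """Global number of samples contributing to one optimizer step."""
    return per_device * grad_accum * world_size
```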
671
+ Parameter Offload - Persistent parameters statistics: param_count = 141, numel = 333312
672
+
673
+
674
 
675
+
676
 
677
+
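The run was launched with `--warmup_ratio 0.05` and the default cosine scheduler (`lr_scheduler_type: cosine` in the saved args). A minimal sketch of that schedule, assuming linear warmup followed by cosine decay to zero as in `transformers`' cosine schedule (`lr_at_step` is a hypothetical helper):

```python
import math

def lr_at_step(step, total_steps, base_lr=5e-6, warmup_ratio=0.05):
    """LR after linear warmup then cosine decay to 0 over total_steps."""
    warmup = int(total_steps * warmup_ratio)
    if step < warmup:
        return base_lr * step / max(1, warmup)
    progress = (step - warmup) / max(1, total_steps - warmup)
    return 0.5 * base_lr * (1 + math.cos(math.pi * progress))
```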
log/20250917-13:49:21.log ADDED
The diff for this file is too large to render. See raw diff
 
v5-20250917-134655/args.json ADDED
@@ -0,0 +1,384 @@
1
+ {
2
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 4,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 5e-06,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 2.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 1,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": null,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": 2000.0,
65
+ "dataloader_num_workers": 48,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": false,
73
+ "metric_for_best_model": "loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch_fused",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "tensorboard"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "check_model": true,
186
+ "acc_strategy": "token",
187
+ "train_dataloader_shuffle": true,
188
+ "max_epochs": null,
189
+ "aligner_lr": null,
190
+ "vit_lr": null,
191
+ "use_logits_to_keep": null,
192
+ "channels": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "Qwen/Qwen2.5-7B-Instruct",
206
+ "model_type": "qwen2_5",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "qwen2_5",
221
+ "system": null,
222
+ "max_length": 16240,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl",
236
+ "/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl",
237
+ "/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl",
238
+ "/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl",
239
+ "/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl"
240
+ ],
241
+ "val_dataset": [],
242
+ "split_dataset_ratio": 0.001,
243
+ "dataset_num_proc": 100,
244
+ "load_from_cache_file": true,
245
+ "dataset_shuffle": true,
246
+ "val_dataset_shuffle": false,
247
+ "streaming": false,
248
+ "interleave_prob": null,
249
+ "stopping_strategy": "first_exhausted",
250
+ "shuffle_buffer_size": 1000,
251
+ "download_mode": "reuse_dataset_if_exists",
252
+ "columns": {},
253
+ "strict": false,
254
+ "model_name": null,
255
+ "model_author": null,
256
+ "custom_dataset_info": [],
257
+ "quant_method": null,
258
+ "quant_bits": null,
259
+ "hqq_axis": null,
260
+ "bnb_4bit_compute_dtype": "bfloat16",
261
+ "bnb_4bit_quant_type": "nf4",
262
+ "bnb_4bit_use_double_quant": true,
263
+ "bnb_4bit_quant_storage": null,
264
+ "max_new_tokens": 64,
265
+ "temperature": 0.0,
266
+ "top_k": null,
267
+ "top_p": null,
268
+ "repetition_penalty": null,
269
+ "num_beams": 1,
270
+ "stream": false,
271
+ "stop_words": [],
272
+ "logprobs": false,
273
+ "top_logprobs": null,
274
+ "ckpt_dir": null,
275
+ "lora_modules": [],
276
+ "train_type": "full",
277
+ "adapters": [],
278
+ "external_plugins": [],
279
+ "model_kwargs": {},
280
+ "load_args": false,
281
+ "load_data_args": false,
282
+ "packing": false,
283
+ "packing_length": null,
284
+ "lazy_tokenize": false,
285
+ "cached_dataset": [],
286
+ "custom_register_path": [],
287
+ "use_hf": false,
288
+ "ignore_args_error": false,
289
+ "use_swift_lora": false,
290
+ "freeze_parameters": [],
291
+ "freeze_parameters_regex": null,
292
+ "freeze_parameters_ratio": 0.0,
293
+ "trainable_parameters": [],
294
+ "trainable_parameters_regex": null,
295
+ "freeze_llm": false,
296
+ "freeze_vit": true,
297
+ "freeze_aligner": false,
298
+ "target_modules": [
299
+ "all-linear"
300
+ ],
301
+ "target_regex": null,
302
+ "modules_to_save": [],
303
+ "lora_rank": 8,
304
+ "lora_alpha": 32,
305
+ "lora_dropout": 0.05,
306
+ "lora_bias": "none",
307
+ "lora_dtype": null,
308
+ "lorap_lr_ratio": null,
309
+ "use_rslora": false,
310
+ "use_dora": false,
311
+ "lora_ga_batch_size": 2,
312
+ "lora_ga_iters": 2,
313
+ "lora_ga_max_length": 1024,
314
+ "lora_ga_direction": "ArB2r",
315
+ "lora_ga_scale": "stable",
316
+ "lora_ga_stable_gamma": 16,
317
+ "init_weights": true,
318
+ "fourier_n_frequency": 2000,
319
+ "fourier_scaling": 300.0,
320
+ "boft_block_size": 4,
321
+ "boft_block_num": 0,
322
+ "boft_n_butterfly_factor": 1,
323
+ "boft_dropout": 0.0,
324
+ "vera_rank": 256,
325
+ "vera_projection_prng_key": 0,
326
+ "vera_dropout": 0.0,
327
+ "vera_d_initial": 0.1,
328
+ "adapter_act": "gelu",
329
+ "adapter_length": 128,
330
+ "use_galore": false,
331
+ "galore_target_modules": null,
332
+ "galore_rank": 128,
333
+ "galore_update_proj_gap": 50,
334
+ "galore_scale": 1.0,
335
+ "galore_proj_type": "std",
336
+ "galore_optim_per_parameter": false,
337
+ "galore_with_embedding": false,
338
+ "galore_quantization": false,
339
+ "galore_proj_quant": false,
340
+ "galore_proj_bits": 4,
341
+ "galore_proj_group_size": 256,
342
+ "galore_cos_threshold": 0.4,
343
+ "galore_gamma_proj": 2,
344
+ "galore_queue_size": 5,
345
+ "adalora_target_r": 8,
346
+ "adalora_init_r": 12,
347
+ "adalora_tinit": 0,
348
+ "adalora_tfinal": 0,
349
+ "adalora_deltaT": 1,
350
+ "adalora_beta1": 0.85,
351
+ "adalora_beta2": 0.85,
352
+ "adalora_orth_reg_weight": 0.5,
353
+ "llamapro_num_new_blocks": 4,
354
+ "llamapro_num_groups": null,
355
+ "lisa_activated_layers": 0,
356
+ "lisa_step_interval": 20,
357
+ "reft_layer_key": null,
358
+ "reft_layers": null,
359
+ "reft_rank": 4,
360
+ "reft_intervention_type": "LoreftIntervention",
361
+ "reft_args": null,
362
+ "swanlab_token": null,
363
+ "swanlab_project": null,
364
+ "swanlab_workspace": null,
365
+ "swanlab_exp_name": null,
366
+ "swanlab_lark_webhook_url": null,
367
+ "swanlab_lark_secret": null,
368
+ "swanlab_mode": "cloud",
369
+ "add_version": true,
370
+ "create_checkpoint_symlink": false,
371
+ "zero_hpz_partition_size": null,
372
+ "deepspeed_autotp_size": null,
373
+ "early_stop_interval": null,
374
+ "rank": 0,
375
+ "global_world_size": 8,
376
+ "local_world_size": 8,
377
+ "model_suffix": "Qwen2.5-7B-Instruct",
378
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
379
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fbf902b9ab0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
+ "model_dir": "/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
+ "hub": "<class 'swift.hub.hub.MSHub'>",
+ "evaluation_strategy": "epoch",
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 
'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+ }
v5-20250917-134655/images/train_epoch.png ADDED
v5-20250917-134655/images/train_grad_norm.png ADDED
v5-20250917-134655/images/train_learning_rate.png ADDED
v5-20250917-134655/images/train_loss.png ADDED
v5-20250917-134655/images/train_token_acc.png ADDED
v5-20250917-134655/logging.jsonl ADDED
@@ -0,0 +1,3 @@
+ {"loss": 1.27844775, "grad_norm": 25.99561637, "learning_rate": 1.3e-07, "token_acc": 0.73113072, "epoch": 0.00267023, "global_step/max_steps": "1/750", "percentage": "0.13%", "elapsed_time": "22s", "remaining_time": "4h 39m 20s", "memory(GiB)": 25.14, "train_speed(iter/s)": 0.044689}
+ {"loss": 1.27462864, "grad_norm": 25.99462207, "learning_rate": 2.6e-07, "token_acc": 0.72517288, "epoch": 0.00534045, "global_step/max_steps": "2/750", "percentage": "0.27%", "elapsed_time": "49s", "remaining_time": "5h 8m 18s", "memory(GiB)": 26.61, "train_speed(iter/s)": 0.040436}
+ {"train_dataset": "2803.201644±911.025599, min=828.000000, max=13246.000000, size=23973", "val_dataset": "3108.476190±840.159544, min=1605.000000, max=4793.000000, size=21", "model_parameter_info": "Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.", "last_model_checkpoint": null, "best_model_checkpoint": null, "best_metric": null, "global_step": 2, "log_history": [{"loss": 1.2784477472305298, "grad_norm": 25.995616372803973, "learning_rate": 1.3157894736842107e-07, "token_acc": 0.7311307191848755, "epoch": 0.0026702269692923898, "step": 1}, {"loss": 1.2746286392211914, "grad_norm": 25.994622070063752, "learning_rate": 2.6315789473684213e-07, "token_acc": 0.7251728773117065, "epoch": 0.0053404539385847796, "step": 2}], "memory": 26.607421875}
v5-20250917-134655/runs/events.out.tfevents.1758088071.TENCENT64.site.218247.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ae03fcdb066801b87d2feb5ea699f32778b8d57b4645456cf7f8b52e9c6bace
+ size 8344
v5-20250917-134655/val_dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
v6-20250917-134949/args.json ADDED
@@ -0,0 +1,384 @@
+ {
+ "output_dir": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949",
+ "overwrite_output_dir": false,
+ "do_train": false,
+ "do_eval": false,
+ "do_predict": false,
+ "eval_strategy": "epoch",
+ "prediction_loss_only": false,
+ "per_device_train_batch_size": 2,
+ "per_device_eval_batch_size": 1,
+ "per_gpu_train_batch_size": null,
+ "per_gpu_eval_batch_size": null,
+ "gradient_accumulation_steps": 4,
+ "eval_accumulation_steps": null,
+ "eval_delay": 0,
+ "torch_empty_cache_steps": null,
+ "learning_rate": 5e-06,
+ "weight_decay": 0.1,
+ "adam_beta1": 0.9,
+ "adam_beta2": 0.95,
+ "adam_epsilon": 1e-08,
+ "max_grad_norm": 1.0,
+ "num_train_epochs": 2.0,
+ "max_steps": -1,
+ "lr_scheduler_type": "cosine",
+ "lr_scheduler_kwargs": null,
+ "warmup_ratio": 0.05,
+ "warmup_steps": 0,
+ "log_level": "passive",
+ "log_level_replica": "warning",
+ "log_on_each_node": true,
+ "logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949/runs",
+ "logging_strategy": "steps",
+ "logging_first_step": true,
+ "logging_steps": 1,
+ "logging_nan_inf_filter": true,
+ "save_strategy": "epoch",
+ "save_steps": 500,
+ "save_total_limit": null,
+ "save_safetensors": true,
+ "save_on_each_node": false,
+ "save_only_model": false,
+ "restore_callback_states_from_checkpoint": false,
+ "no_cuda": false,
+ "use_cpu": false,
+ "use_mps_device": false,
+ "seed": 42,
+ "data_seed": 42,
+ "jit_mode_eval": false,
+ "use_ipex": false,
+ "bf16": true,
+ "fp16": false,
+ "fp16_opt_level": "O1",
+ "half_precision_backend": "auto",
+ "bf16_full_eval": false,
+ "fp16_full_eval": false,
+ "tf32": null,
+ "local_rank": 0,
+ "ddp_backend": null,
+ "tpu_num_cores": null,
+ "tpu_metrics_debug": false,
+ "debug": null,
+ "dataloader_drop_last": false,
+ "eval_steps": 2000.0,
+ "dataloader_num_workers": 48,
+ "dataloader_prefetch_factor": null,
+ "past_index": -1,
+ "run_name": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949",
+ "disable_tqdm": null,
+ "remove_unused_columns": true,
+ "label_names": null,
+ "load_best_model_at_end": false,
+ "metric_for_best_model": "loss",
+ "greater_is_better": false,
+ "ignore_data_skip": false,
+ "fsdp": "",
+ "fsdp_min_num_params": 0,
+ "fsdp_config": null,
+ "fsdp_transformer_layer_cls_to_wrap": null,
+ "accelerator_config": {
+ "dispatch_batches": false
+ },
+ "deepspeed": {
+ "fp16": {
+ "enabled": "auto",
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "initial_scale_power": 16,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "bf16": {
+ "enabled": "auto"
+ },
+ "zero_optimization": {
+ "stage": 3,
+ "offload_optimizer": {
+ "device": "none",
+ "pin_memory": true
+ },
+ "offload_param": {
+ "device": "none",
+ "pin_memory": true
+ },
+ "overlap_comm": false,
+ "contiguous_gradients": true,
+ "sub_group_size": 1000000000.0,
+ "reduce_bucket_size": "auto",
+ "zero_quantized_weights": false,
+ "zero_quantized_gradients": false,
+ "stage3_prefetch_bucket_size": "auto",
+ "stage3_param_persistence_threshold": "auto",
+ "stage3_max_live_parameters": 1000000000.0,
+ "stage3_max_reuse_distance": 1000000000.0,
+ "stage3_gather_16bit_weights_on_model_save": true
+ },
+ "gradient_accumulation_steps": "auto",
+ "gradient_clipping": "auto",
+ "steps_per_print": 2000,
+ "train_batch_size": "auto",
+ "train_micro_batch_size_per_gpu": "auto",
+ "wall_clock_breakdown": false
+ },
+ "label_smoothing_factor": 0.0,
+ "optim": "adamw_torch_fused",
+ "optim_args": null,
+ "adafactor": false,
+ "group_by_length": false,
+ "length_column_name": "length",
+ "report_to": [
+ "tensorboard"
+ ],
+ "ddp_find_unused_parameters": null,
+ "ddp_bucket_cap_mb": null,
+ "ddp_broadcast_buffers": null,
+ "dataloader_pin_memory": true,
+ "dataloader_persistent_workers": false,
+ "skip_memory_metrics": true,
+ "use_legacy_prediction_loop": false,
+ "push_to_hub": false,
+ "resume_from_checkpoint": null,
+ "hub_model_id": null,
+ "hub_strategy": "every_save",
+ "hub_token": null,
+ "hub_private_repo": null,
+ "hub_always_push": false,
+ "hub_revision": null,
+ "gradient_checkpointing": true,
+ "gradient_checkpointing_kwargs": null,
+ "include_inputs_for_metrics": false,
+ "include_for_metrics": [],
+ "eval_do_concat_batches": true,
+ "fp16_backend": "auto",
+ "push_to_hub_model_id": null,
+ "push_to_hub_organization": null,
+ "push_to_hub_token": null,
+ "mp_parameters": "",
+ "auto_find_batch_size": false,
+ "full_determinism": false,
+ "torchdynamo": null,
+ "ray_scope": "last",
+ "ddp_timeout": 18000000,
+ "torch_compile": false,
+ "torch_compile_backend": null,
+ "torch_compile_mode": null,
+ "include_tokens_per_second": false,
+ "include_num_input_tokens_seen": false,
+ "neftune_noise_alpha": null,
+ "optim_target_modules": null,
+ "batch_eval_metrics": false,
+ "eval_on_start": false,
+ "use_liger_kernel": false,
+ "liger_kernel_config": null,
+ "eval_use_gather_object": false,
+ "average_tokens_across_devices": true,
+ "sortish_sampler": false,
+ "predict_with_generate": false,
+ "generation_max_length": null,
+ "generation_num_beams": null,
+ "generation_config": null,
+ "tuner_backend": "peft",
+ "vit_gradient_checkpointing": null,
+ "router_aux_loss_coef": 0.0,
+ "enable_dft_loss": false,
+ "check_model": true,
+ "acc_strategy": "token",
+ "train_dataloader_shuffle": true,
+ "max_epochs": null,
+ "aligner_lr": null,
+ "vit_lr": null,
+ "use_logits_to_keep": null,
+ "channels": null,
+ "ds3_gather_for_generation": true,
+ "resume_only_model": false,
+ "optimizer": null,
+ "loss_type": null,
+ "metric": null,
+ "eval_use_evalscope": false,
+ "eval_dataset": [],
+ "eval_dataset_args": null,
+ "eval_limit": null,
+ "eval_generation_config": null,
+ "extra_eval_args": null,
+ "use_flash_ckpt": false,
+ "model": "Qwen/Qwen2.5-7B-Instruct",
+ "model_type": "qwen2_5",
+ "model_revision": null,
+ "task_type": "causal_lm",
+ "torch_dtype": "bfloat16",
+ "attn_impl": null,
+ "new_special_tokens": [],
+ "num_labels": null,
+ "problem_type": null,
+ "rope_scaling": null,
+ "device_map": null,
+ "max_memory": {},
+ "max_model_len": null,
+ "local_repo_path": null,
+ "init_strategy": null,
+ "template": "qwen2_5",
+ "system": null,
+ "max_length": 16240,
+ "truncation_strategy": "delete",
+ "max_pixels": null,
+ "agent_template": null,
+ "norm_bbox": null,
+ "use_chat_template": true,
+ "padding_free": false,
+ "padding_side": "right",
+ "loss_scale": "default",
+ "sequence_parallel_size": 1,
+ "response_prefix": null,
+ "template_backend": "swift",
+ "dataset": [
+ "/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl",
+ "/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl",
+ "/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl",
+ "/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl",
+ "/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl"
+ ],
+ "val_dataset": [],
+ "split_dataset_ratio": 0.001,
+ "dataset_num_proc": 100,
+ "load_from_cache_file": true,
+ "dataset_shuffle": true,
+ "val_dataset_shuffle": false,
+ "streaming": false,
+ "interleave_prob": null,
+ "stopping_strategy": "first_exhausted",
+ "shuffle_buffer_size": 1000,
+ "download_mode": "reuse_dataset_if_exists",
+ "columns": {},
+ "strict": false,
+ "model_name": null,
+ "model_author": null,
+ "custom_dataset_info": [],
+ "quant_method": null,
+ "quant_bits": null,
+ "hqq_axis": null,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "bnb_4bit_quant_storage": null,
+ "max_new_tokens": 64,
+ "temperature": 0.0,
+ "top_k": null,
+ "top_p": null,
+ "repetition_penalty": null,
+ "num_beams": 1,
+ "stream": false,
+ "stop_words": [],
+ "logprobs": false,
+ "top_logprobs": null,
+ "ckpt_dir": null,
+ "lora_modules": [],
+ "train_type": "full",
+ "adapters": [],
+ "external_plugins": [],
+ "model_kwargs": {},
+ "load_args": false,
+ "load_data_args": false,
+ "packing": false,
+ "packing_length": null,
+ "lazy_tokenize": false,
+ "cached_dataset": [],
+ "custom_register_path": [],
+ "use_hf": false,
+ "ignore_args_error": false,
+ "use_swift_lora": false,
+ "freeze_parameters": [],
+ "freeze_parameters_regex": null,
+ "freeze_parameters_ratio": 0.0,
+ "trainable_parameters": [],
+ "trainable_parameters_regex": null,
+ "freeze_llm": false,
+ "freeze_vit": true,
+ "freeze_aligner": false,
+ "target_modules": [
+ "all-linear"
+ ],
+ "target_regex": null,
+ "modules_to_save": [],
+ "lora_rank": 8,
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "lora_bias": "none",
+ "lora_dtype": null,
+ "lorap_lr_ratio": null,
+ "use_rslora": false,
+ "use_dora": false,
+ "lora_ga_batch_size": 2,
+ "lora_ga_iters": 2,
+ "lora_ga_max_length": 1024,
+ "lora_ga_direction": "ArB2r",
+ "lora_ga_scale": "stable",
+ "lora_ga_stable_gamma": 16,
+ "init_weights": true,
+ "fourier_n_frequency": 2000,
+ "fourier_scaling": 300.0,
+ "boft_block_size": 4,
+ "boft_block_num": 0,
+ "boft_n_butterfly_factor": 1,
+ "boft_dropout": 0.0,
+ "vera_rank": 256,
+ "vera_projection_prng_key": 0,
+ "vera_dropout": 0.0,
+ "vera_d_initial": 0.1,
+ "adapter_act": "gelu",
+ "adapter_length": 128,
+ "use_galore": false,
+ "galore_target_modules": null,
+ "galore_rank": 128,
+ "galore_update_proj_gap": 50,
+ "galore_scale": 1.0,
+ "galore_proj_type": "std",
+ "galore_optim_per_parameter": false,
+ "galore_with_embedding": false,
+ "galore_quantization": false,
+ "galore_proj_quant": false,
+ "galore_proj_bits": 4,
+ "galore_proj_group_size": 256,
+ "galore_cos_threshold": 0.4,
+ "galore_gamma_proj": 2,
+ "galore_queue_size": 5,
+ "adalora_target_r": 8,
+ "adalora_init_r": 12,
+ "adalora_tinit": 0,
+ "adalora_tfinal": 0,
+ "adalora_deltaT": 1,
+ "adalora_beta1": 0.85,
+ "adalora_beta2": 0.85,
+ "adalora_orth_reg_weight": 0.5,
+ "llamapro_num_new_blocks": 4,
+ "llamapro_num_groups": null,
+ "lisa_activated_layers": 0,
+ "lisa_step_interval": 20,
+ "reft_layer_key": null,
+ "reft_layers": null,
+ "reft_rank": 4,
+ "reft_intervention_type": "LoreftIntervention",
+ "reft_args": null,
+ "swanlab_token": null,
+ "swanlab_project": null,
+ "swanlab_workspace": null,
+ "swanlab_exp_name": null,
+ "swanlab_lark_webhook_url": null,
+ "swanlab_lark_secret": null,
+ "swanlab_mode": "cloud",
+ "add_version": true,
+ "create_checkpoint_symlink": false,
+ "zero_hpz_partition_size": null,
+ "deepspeed_autotp_size": null,
+ "early_stop_interval": null,
+ "rank": 0,
+ "global_world_size": 8,
+ "local_world_size": 8,
+ "model_suffix": "Qwen2.5-7B-Instruct",
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', 
hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f18b0b01ab0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
+ "model_dir": "/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
+ "hub": "<class 'swift.hub.hub.MSHub'>",
+ "evaluation_strategy": "epoch",
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 
'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], 
eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
+ }
v6-20250917-134949/logging.jsonl ADDED
@@ -0,0 +1,171 @@
+ {"loss": 1.27844775, "grad_norm": 25.99517388, "learning_rate": 1.3e-07, "token_acc": 0.73113072, "epoch": 0.00267023, "global_step/max_steps": "1/750", "percentage": "0.13%", "elapsed_time": "22s", "remaining_time": "4h 35m 7s", "memory(GiB)": 25.14, "train_speed(iter/s)": 0.045372}
+ {"loss": 1.27462864, "grad_norm": 26.01345823, "learning_rate": 2.6e-07, "token_acc": 0.72517288, "epoch": 0.00534045, "global_step/max_steps": "2/750", "percentage": "0.27%", "elapsed_time": "49s", "remaining_time": "5h 6m 9s", "memory(GiB)": 26.61, "train_speed(iter/s)": 0.040719}
+ {"loss": 1.39245546, "grad_norm": 27.78962546, "learning_rate": 3.9e-07, "token_acc": 0.69932902, "epoch": 0.00801068, "global_step/max_steps": "3/750", "percentage": "0.40%", "elapsed_time": "1m 14s", "remaining_time": "5h 7m 12s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.040526}
+ {"loss": 1.26101208, "grad_norm": 25.88373791, "learning_rate": 5.3e-07, "token_acc": 0.72536421, "epoch": 0.01068091, "global_step/max_steps": "4/750", "percentage": "0.53%", "elapsed_time": "1m 35s", "remaining_time": "4h 57m 46s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.041755}
+ {"loss": 1.30307007, "grad_norm": 27.26872908, "learning_rate": 6.6e-07, "token_acc": 0.72423238, "epoch": 0.01335113, "global_step/max_steps": "5/750", "percentage": "0.67%", "elapsed_time": "1m 55s", "remaining_time": "4h 47m 25s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.043201}
+ {"loss": 1.22360718, "grad_norm": 25.55179448, "learning_rate": 7.9e-07, "token_acc": 0.7291103, "epoch": 0.01602136, "global_step/max_steps": "6/750", "percentage": "0.80%", "elapsed_time": "2m 17s", "remaining_time": "4h 45m 3s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.0435}
+ {"loss": 1.24122119, "grad_norm": 23.07582775, "learning_rate": 9.2e-07, "token_acc": 0.72044706, "epoch": 0.01869159, "global_step/max_steps": "7/750", "percentage": "0.93%", "elapsed_time": "2m 39s", "remaining_time": "4h 42m 38s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.043812}
+ {"loss": 1.14965189, "grad_norm": 22.20997164, "learning_rate": 1.05e-06, "token_acc": 0.72885573, "epoch": 0.02136182, "global_step/max_steps": "8/750", "percentage": "1.07%", "elapsed_time": "3m 0s", "remaining_time": "4h 39m 45s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.044206}
+ {"loss": 1.10636055, "grad_norm": 20.4945993, "learning_rate": 1.18e-06, "token_acc": 0.74439591, "epoch": 0.02403204, "global_step/max_steps": "9/750", "percentage": "1.20%", "elapsed_time": "3m 24s", "remaining_time": "4h 40m 40s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.044002}
+ {"loss": 1.02117717, "grad_norm": 13.49092053, "learning_rate": 1.32e-06, "token_acc": 0.7293604, "epoch": 0.02670227, "global_step/max_steps": "10/750", "percentage": "1.33%", "elapsed_time": "3m 51s", "remaining_time": "4h 45m 47s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.043155}
+ {"loss": 0.91030097, "grad_norm": 11.63646161, "learning_rate": 1.45e-06, "token_acc": 0.75036925, "epoch": 0.0293725, "global_step/max_steps": "11/750", "percentage": "1.47%", "elapsed_time": "4m 14s", "remaining_time": "4h 45m 7s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.043197}
+ {"loss": 0.94025064, "grad_norm": 10.67731611, "learning_rate": 1.58e-06, "token_acc": 0.73384029, "epoch": 0.03204272, "global_step/max_steps": "12/750", "percentage": "1.60%", "elapsed_time": "4m 53s", "remaining_time": "5h 1m 1s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040859}
+ {"loss": 0.88358194, "grad_norm": 8.83214012, "learning_rate": 1.71e-06, "token_acc": 0.74344194, "epoch": 0.03471295, "global_step/max_steps": "13/750", "percentage": "1.73%", "elapsed_time": "5m 18s", "remaining_time": "5h 0m 35s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040864}
+ {"loss": 0.77978659, "grad_norm": 8.46470553, "learning_rate": 1.84e-06, "token_acc": 0.76530904, "epoch": 0.03738318, "global_step/max_steps": "14/750", "percentage": "1.87%", "elapsed_time": "5m 40s", "remaining_time": "4h 58m 38s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041076}
+ {"loss": 0.77927828, "grad_norm": 7.27100215, "learning_rate": 1.97e-06, "token_acc": 0.77147579, "epoch": 0.0400534, "global_step/max_steps": "15/750", "percentage": "2.00%", "elapsed_time": "6m 5s", "remaining_time": "4h 58m 52s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040986}
+ {"loss": 0.84693223, "grad_norm": 7.18134719, "learning_rate": 2.11e-06, "token_acc": 0.75283021, "epoch": 0.04272363, "global_step/max_steps": "16/750", "percentage": "2.13%", "elapsed_time": "6m 26s", "remaining_time": "4h 55m 47s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041358}
+ {"loss": 0.78003627, "grad_norm": 5.70513852, "learning_rate": 2.24e-06, "token_acc": 0.77384925, "epoch": 0.04539386, "global_step/max_steps": "17/750", "percentage": "2.27%", "elapsed_time": "6m 45s", "remaining_time": "4h 51m 7s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041965}
+ {"loss": 0.78369838, "grad_norm": 5.66400262, "learning_rate": 2.37e-06, "token_acc": 0.77144468, "epoch": 0.04806409, "global_step/max_steps": "18/750", "percentage": "2.40%", "elapsed_time": "7m 8s", "remaining_time": "4h 50m 40s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041972}
+ {"loss": 0.71311992, "grad_norm": 5.2084166, "learning_rate": 2.5e-06, "token_acc": 0.78415757, "epoch": 0.05073431, "global_step/max_steps": "19/750", "percentage": "2.53%", "elapsed_time": "7m 30s", "remaining_time": "4h 48m 36s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042214}
+ {"loss": 0.67186064, "grad_norm": 4.74874935, "learning_rate": 2.63e-06, "token_acc": 0.80047834, "epoch": 0.05340454, "global_step/max_steps": "20/750", "percentage": "2.67%", "elapsed_time": "7m 55s", "remaining_time": "4h 49m 11s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042071}
+ {"loss": 0.69137228, "grad_norm": 4.78263632, "learning_rate": 2.76e-06, "token_acc": 0.79207921, "epoch": 0.05607477, "global_step/max_steps": "21/750", "percentage": "2.80%", "elapsed_time": "8m 17s", "remaining_time": "4h 47m 46s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.04222}
+ {"loss": 0.64969599, "grad_norm": 4.49963775, "learning_rate": 2.89e-06, "token_acc": 0.80375814, "epoch": 0.05874499, "global_step/max_steps": "22/750", "percentage": "2.93%", "elapsed_time": "8m 39s", "remaining_time": "4h 46m 23s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042366}
+ {"loss": 0.697088, "grad_norm": 4.21276913, "learning_rate": 3.03e-06, "token_acc": 0.78882074, "epoch": 0.06141522, "global_step/max_steps": "23/750", "percentage": "3.07%", "elapsed_time": "9m 3s", "remaining_time": "4h 46m 12s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042335}
+ {"loss": 0.66172689, "grad_norm": 4.00953554, "learning_rate": 3.16e-06, "token_acc": 0.7970311, "epoch": 0.06408545, "global_step/max_steps": "24/750", "percentage": "3.20%", "elapsed_time": "9m 26s", "remaining_time": "4h 45m 41s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042352}
+ {"loss": 0.665833, "grad_norm": 4.67415643, "learning_rate": 3.29e-06, "token_acc": 0.79373443, "epoch": 0.06675567, "global_step/max_steps": "25/750", "percentage": "3.33%", "elapsed_time": "9m 50s", "remaining_time": "4h 45m 14s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042362}
+ {"loss": 0.60853577, "grad_norm": 4.33591965, "learning_rate": 3.42e-06, "token_acc": 0.80888575, "epoch": 0.0694259, "global_step/max_steps": "26/750", "percentage": "3.47%", "elapsed_time": "10m 11s", "remaining_time": "4h 43m 48s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042516}
+ {"loss": 0.68158579, "grad_norm": 4.57626334, "learning_rate": 3.55e-06, "token_acc": 0.79410064, "epoch": 0.07209613, "global_step/max_steps": "27/750", "percentage": "3.60%", "elapsed_time": "10m 31s", "remaining_time": "4h 41m 51s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042753}
+ {"loss": 0.57617843, "grad_norm": 3.84859446, "learning_rate": 3.68e-06, "token_acc": 0.8145172, "epoch": 0.07476636, "global_step/max_steps": "28/750", "percentage": "3.73%", "elapsed_time": "10m 54s", "remaining_time": "4h 41m 13s", "memory(GiB)": 49.15, "train_speed(iter/s)": 0.042789}
+ {"loss": 0.58978891, "grad_norm": 3.78972326, "learning_rate": 3.82e-06, "token_acc": 0.81657141, "epoch": 0.07743658, "global_step/max_steps": "29/750", "percentage": "3.87%", "elapsed_time": "11m 16s", "remaining_time": "4h 40m 14s", "memory(GiB)": 49.15, "train_speed(iter/s)": 0.042879}
+ {"loss": 0.65006042, "grad_norm": 3.80896471, "learning_rate": 3.95e-06, "token_acc": 0.79703522, "epoch": 0.08010681, "global_step/max_steps": "30/750", "percentage": "4.00%", "elapsed_time": "11m 41s", "remaining_time": "4h 40m 38s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04276}
+ {"loss": 0.56551218, "grad_norm": 3.48714797, "learning_rate": 4.08e-06, "token_acc": 0.81890911, "epoch": 0.08277704, "global_step/max_steps": "31/750", "percentage": "4.13%", "elapsed_time": "12m 0s", "remaining_time": "4h 38m 39s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043004}
+ {"loss": 0.56460661, "grad_norm": 3.46457404, "learning_rate": 4.21e-06, "token_acc": 0.82258999, "epoch": 0.08544726, "global_step/max_steps": "32/750", "percentage": "4.27%", "elapsed_time": "12m 20s", "remaining_time": "4h 36m 47s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043233}
+ {"loss": 0.60303181, "grad_norm": 3.22141669, "learning_rate": 4.34e-06, "token_acc": 0.80737174, "epoch": 0.08811749, "global_step/max_steps": "33/750", "percentage": "4.40%", "elapsed_time": "12m 42s", "remaining_time": "4h 36m 2s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04329}
+ {"loss": 0.57607186, "grad_norm": 3.21232272, "learning_rate": 4.47e-06, "token_acc": 0.81398487, "epoch": 0.09078772, "global_step/max_steps": "34/750", "percentage": "4.53%", "elapsed_time": "13m 7s", "remaining_time": "4h 36m 19s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043186}
+ {"loss": 0.60476643, "grad_norm": 3.3457971, "learning_rate": 4.61e-06, "token_acc": 0.80856991, "epoch": 0.09345794, "global_step/max_steps": "35/750", "percentage": "4.67%", "elapsed_time": "13m 28s", "remaining_time": "4h 35m 22s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043273}
+ {"loss": 0.58224332, "grad_norm": 3.21682624, "learning_rate": 4.74e-06, "token_acc": 0.81014532, "epoch": 0.09612817, "global_step/max_steps": "36/750", "percentage": "4.80%", "elapsed_time": "13m 52s", "remaining_time": "4h 35m 12s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043241}
+ {"loss": 0.55732399, "grad_norm": 3.22129243, "learning_rate": 4.87e-06, "token_acc": 0.82355452, "epoch": 0.0987984, "global_step/max_steps": "37/750", "percentage": "4.93%", "elapsed_time": "14m 12s", "remaining_time": "4h 33m 44s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043412}
+ {"loss": 0.51289082, "grad_norm": 3.11871556, "learning_rate": 5e-06, "token_acc": 0.82989198, "epoch": 0.10146862, "global_step/max_steps": "38/750", "percentage": "5.07%", "elapsed_time": "14m 37s", "remaining_time": "4h 33m 54s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043324}
+ {"loss": 0.55835736, "grad_norm": 3.41305042, "learning_rate": 5e-06, "token_acc": 0.81758821, "epoch": 0.10413885, "global_step/max_steps": "39/750", "percentage": "5.20%", "elapsed_time": "14m 57s", "remaining_time": "4h 32m 38s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043465}
+ {"loss": 0.55718553, "grad_norm": 3.10799851, "learning_rate": 5e-06, "token_acc": 0.82248998, "epoch": 0.10680908, "global_step/max_steps": "40/750", "percentage": "5.33%", "elapsed_time": "15m 18s", "remaining_time": "4h 31m 45s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043545}
+ {"loss": 0.5457294, "grad_norm": 2.96653442, "learning_rate": 5e-06, "token_acc": 0.83149326, "epoch": 0.10947931, "global_step/max_steps": "41/750", "percentage": "5.47%", "elapsed_time": "15m 41s", "remaining_time": "4h 31m 27s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04353}
+ {"loss": 0.60883057, "grad_norm": 3.02415036, "learning_rate": 5e-06, "token_acc": 0.80839103, "epoch": 0.11214953, "global_step/max_steps": "42/750", "percentage": "5.60%", "elapsed_time": "16m 2s", "remaining_time": "4h 30m 26s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043633}
+ {"loss": 0.5560565, "grad_norm": 2.80719127, "learning_rate": 5e-06, "token_acc": 0.8238644, "epoch": 0.11481976, "global_step/max_steps": "43/750", "percentage": "5.73%", "elapsed_time": "16m 22s", "remaining_time": "4h 29m 18s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043755}
+ {"loss": 0.50780284, "grad_norm": 2.90999056, "learning_rate": 5e-06, "token_acc": 0.83437282, "epoch": 0.11748999, "global_step/max_steps": "44/750", "percentage": "5.87%", "elapsed_time": "16m 44s", "remaining_time": "4h 28m 44s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043785}
+ {"loss": 0.49909928, "grad_norm": 2.96779505, "learning_rate": 5e-06, "token_acc": 0.83670205, "epoch": 0.12016021, "global_step/max_steps": "45/750", "percentage": "6.00%", "elapsed_time": "17m 3s", "remaining_time": "4h 27m 21s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04395}
+ {"loss": 0.59060061, "grad_norm": 2.98663031, "learning_rate": 5e-06, "token_acc": 0.81317079, "epoch": 0.12283044, "global_step/max_steps": "46/750", "percentage": "6.13%", "elapsed_time": "17m 28s", "remaining_time": "4h 27m 23s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043879}
+ {"loss": 0.50838113, "grad_norm": 2.86363356, "learning_rate": 5e-06, "token_acc": 0.83875698, "epoch": 0.12550067, "global_step/max_steps": "47/750", "percentage": "6.27%", "elapsed_time": "17m 53s", "remaining_time": "4h 27m 33s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043791}
+ {"loss": 0.4888871, "grad_norm": 2.85946941, "learning_rate": 5e-06, "token_acc": 0.8355937, "epoch": 0.12817089, "global_step/max_steps": "48/750", "percentage": "6.40%", "elapsed_time": "18m 13s", "remaining_time": "4h 26m 39s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043876}
+ {"loss": 0.54882491, "grad_norm": 2.9982353, "learning_rate": 5e-06, "token_acc": 0.82041854, "epoch": 0.13084112, "global_step/max_steps": "49/750", "percentage": "6.53%", "elapsed_time": "18m 43s", "remaining_time": "4h 27m 54s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043609}
+ {"loss": 0.50211453, "grad_norm": 2.84865014, "learning_rate": 5e-06, "token_acc": 0.83424717, "epoch": 0.13351135, "global_step/max_steps": "50/750", "percentage": "6.67%", "elapsed_time": "19m 3s", "remaining_time": "4h 26m 50s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043722}
+ {"loss": 0.48680127, "grad_norm": 2.95464195, "learning_rate": 5e-06, "token_acc": 0.84170783, "epoch": 0.13618158, "global_step/max_steps": "51/750", "percentage": "6.80%", "elapsed_time": "19m 26s", "remaining_time": "4h 26m 31s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043711}
+ {"loss": 0.55014861, "grad_norm": 2.775706, "learning_rate": 5e-06, "token_acc": 0.8193754, "epoch": 0.1388518, "global_step/max_steps": "52/750", "percentage": "6.93%", "elapsed_time": "19m 49s", "remaining_time": "4h 26m 13s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043698}
+ {"loss": 0.56171989, "grad_norm": 2.8737905, "learning_rate": 4.99e-06, "token_acc": 0.82004064, "epoch": 0.14152203, "global_step/max_steps": "53/750", "percentage": "7.07%", "elapsed_time": "20m 10s", "remaining_time": "4h 25m 21s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043778}
+ {"loss": 0.45519453, "grad_norm": 2.8556276, "learning_rate": 4.99e-06, "token_acc": 0.8451218, "epoch": 0.14419226, "global_step/max_steps": "54/750", "percentage": "7.20%", "elapsed_time": "20m 34s", "remaining_time": "4h 25m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043747}
+ {"loss": 0.51091546, "grad_norm": 2.99209119, "learning_rate": 4.99e-06, "token_acc": 0.8289665, "epoch": 0.14686248, "global_step/max_steps": "55/750", "percentage": "7.33%", "elapsed_time": "20m 52s", "remaining_time": "4h 23m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043926}
+ {"loss": 0.48474944, "grad_norm": 2.78579951, "learning_rate": 4.99e-06, "token_acc": 0.83626682, "epoch": 0.14953271, "global_step/max_steps": "56/750", "percentage": "7.47%", "elapsed_time": "21m 10s", "remaining_time": "4h 22m 29s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.044066}
+ {"loss": 0.4932918, "grad_norm": 2.65078186, "learning_rate": 4.99e-06, "token_acc": 0.83660716, "epoch": 0.15220294, "global_step/max_steps": "57/750", "percentage": "7.60%", "elapsed_time": "21m 48s", "remaining_time": "4h 25m 6s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043567}
+ {"loss": 0.50794923, "grad_norm": 3.02283834, "learning_rate": 4.99e-06, "token_acc": 0.83110255, "epoch": 0.15487316, "global_step/max_steps": "58/750", "percentage": "7.73%", "elapsed_time": "22m 12s", "remaining_time": "4h 25m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04352}
+ {"loss": 0.49793923, "grad_norm": 2.69818223, "learning_rate": 4.99e-06, "token_acc": 0.83278316, "epoch": 0.15754339, "global_step/max_steps": "59/750", "percentage": "7.87%", "elapsed_time": "22m 33s", "remaining_time": "4h 24m 12s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043589}
+ {"loss": 0.54689407, "grad_norm": 2.97761434, "learning_rate": 4.99e-06, "token_acc": 0.8215152, "epoch": 0.16021362, "global_step/max_steps": "60/750", "percentage": "8.00%", "elapsed_time": "22m 56s", "remaining_time": "4h 23m 45s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.0436}
+ {"loss": 0.47106403, "grad_norm": 3.09528933, "learning_rate": 4.99e-06, "token_acc": 0.8421275, "epoch": 0.16288385, "global_step/max_steps": "61/750", "percentage": "8.13%", "elapsed_time": "23m 18s", "remaining_time": "4h 23m 11s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043632}
+ {"loss": 0.51365429, "grad_norm": 2.91650805, "learning_rate": 4.99e-06, "token_acc": 0.83015621, "epoch": 0.16555407, "global_step/max_steps": "62/750", "percentage": "8.27%", "elapsed_time": "23m 43s", "remaining_time": "4h 23m 17s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043551}
+ {"loss": 0.50668406, "grad_norm": 2.8659642, "learning_rate": 4.98e-06, "token_acc": 0.83339614, "epoch": 0.1682243, "global_step/max_steps": "63/750", "percentage": "8.40%", "elapsed_time": "24m 19s", "remaining_time": "4h 25m 18s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043157}
+ {"loss": 0.48768979, "grad_norm": 2.62879961, "learning_rate": 4.98e-06, "token_acc": 0.84143567, "epoch": 0.17089453, "global_step/max_steps": "64/750", "percentage": "8.53%", "elapsed_time": "24m 49s", "remaining_time": "4h 26m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042982}
+ {"loss": 0.47808069, "grad_norm": 2.7667854, "learning_rate": 4.98e-06, "token_acc": 0.84517378, "epoch": 0.17356475, "global_step/max_steps": "65/750", "percentage": "8.67%", "elapsed_time": "25m 7s", "remaining_time": "4h 24m 45s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043122}
+ {"loss": 0.51385736, "grad_norm": 2.66850491, "learning_rate": 4.98e-06, "token_acc": 0.8314994, "epoch": 0.17623498, "global_step/max_steps": "66/750", "percentage": "8.80%", "elapsed_time": "25m 28s", "remaining_time": "4h 24m 5s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043167}
+ {"loss": 0.46788478, "grad_norm": 2.90791281, "learning_rate": 4.98e-06, "token_acc": 0.84539282, "epoch": 0.17890521, "global_step/max_steps": "67/750", "percentage": "8.93%", "elapsed_time": "25m 51s", "remaining_time": "4h 23m 32s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043193}
+ {"loss": 0.49202442, "grad_norm": 2.85655592, "learning_rate": 4.98e-06, "token_acc": 0.84020692, "epoch": 0.18157543, "global_step/max_steps": "68/750", "percentage": "9.07%", "elapsed_time": "26m 15s", "remaining_time": "4h 23m 19s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043165}
+ {"loss": 0.49528018, "grad_norm": 2.83720613, "learning_rate": 4.98e-06, "token_acc": 0.83596516, "epoch": 0.18424566, "global_step/max_steps": "69/750", "percentage": "9.20%", "elapsed_time": "26m 36s", "remaining_time": "4h 22m 36s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043219}
+ {"loss": 0.49208367, "grad_norm": 2.76896128, "learning_rate": 4.98e-06, "token_acc": 0.8359322, "epoch": 0.18691589, "global_step/max_steps": "70/750", "percentage": "9.33%", "elapsed_time": "26m 58s", "remaining_time": "4h 21m 59s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043258}
+ {"loss": 0.5072571, "grad_norm": 2.68663506, "learning_rate": 4.97e-06, "token_acc": 0.83775103, "epoch": 0.18958611, "global_step/max_steps": "71/750", "percentage": "9.47%", "elapsed_time": "27m 37s", "remaining_time": "4h 24m 13s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04283}
+ {"loss": 0.4340511, "grad_norm": 2.47670799, "learning_rate": 4.97e-06, "token_acc": 0.85182673, "epoch": 0.19225634, "global_step/max_steps": "72/750", "percentage": "9.60%", "elapsed_time": "27m 59s", "remaining_time": "4h 23m 39s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042857}
+ {"loss": 0.42714188, "grad_norm": 2.57520246, "learning_rate": 4.97e-06, "token_acc": 0.85724175, "epoch": 0.19492657, "global_step/max_steps": "73/750", "percentage": "9.73%", "elapsed_time": "28m 19s", "remaining_time": "4h 22m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042951}
+ {"loss": 0.50578225, "grad_norm": 2.99044505, "learning_rate": 4.97e-06, "token_acc": 0.83729088, "epoch": 0.1975968, "global_step/max_steps": "74/750", "percentage": "9.87%", "elapsed_time": "28m 44s", "remaining_time": "4h 22m 35s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042906}
+ {"loss": 0.51055467, "grad_norm": 3.09458818, "learning_rate": 4.97e-06, "token_acc": 0.82715166, "epoch": 0.20026702, "global_step/max_steps": "75/750", "percentage": "10.00%", "elapsed_time": "29m 7s", "remaining_time": "4h 22m 4s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042927}
+ {"loss": 0.45781508, "grad_norm": 2.63747937, "learning_rate": 4.96e-06, "token_acc": 0.85162896, "epoch": 0.20293725, "global_step/max_steps": "76/750", "percentage": "10.13%", "elapsed_time": "29m 34s", "remaining_time": "4h 22m 18s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042826}
+ {"loss": 0.4840948, "grad_norm": 2.71359866, "learning_rate": 4.96e-06, "token_acc": 0.83874184, "epoch": 0.20560748, "global_step/max_steps": "77/750", "percentage": "10.27%", "elapsed_time": "29m 59s", "remaining_time": "4h 22m 8s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042789}
+ {"loss": 0.45613506, "grad_norm": 2.46367457, "learning_rate": 4.96e-06, "token_acc": 0.84933531, "epoch": 0.2082777, "global_step/max_steps": "78/750", "percentage": "10.40%", "elapsed_time": "30m 23s", "remaining_time": "4h 21m 46s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042784}
+ {"loss": 0.47574463, "grad_norm": 2.68293019, "learning_rate": 4.96e-06, "token_acc": 0.84153438, "epoch": 0.21094793, "global_step/max_steps": "79/750", "percentage": "10.53%", "elapsed_time": "30m 48s", "remaining_time": "4h 21m 37s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042745}
+ {"loss": 0.52936327, "grad_norm": 2.77941479, "learning_rate": 4.96e-06, "token_acc": 0.82768434, "epoch": 0.21361816, "global_step/max_steps": "80/750", "percentage": "10.67%", "elapsed_time": "31m 11s", "remaining_time": "4h 21m 16s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04274}
+ {"loss": 0.4498612, "grad_norm": 2.49701486, "learning_rate": 4.96e-06, "token_acc": 0.84986889, "epoch": 0.21628838, "global_step/max_steps": "81/750", "percentage": "10.80%", "elapsed_time": "31m 33s", "remaining_time": "4h 20m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042768}
+ {"loss": 0.45617718, "grad_norm": 2.66215104, "learning_rate": 4.95e-06, "token_acc": 0.84527934, "epoch": 0.21895861, "global_step/max_steps": "82/750", "percentage": "10.93%", "elapsed_time": "31m 55s", "remaining_time": "4h 20m 5s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042807}
+ {"loss": 0.49068668, "grad_norm": 2.83164211, "learning_rate": 4.95e-06, "token_acc": 0.83319885, "epoch": 0.22162884, "global_step/max_steps": "83/750", "percentage": "11.07%", "elapsed_time": "32m 16s", "remaining_time": "4h 19m 25s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042851}
+ {"loss": 0.47080141, "grad_norm": 2.62070172, "learning_rate": 4.95e-06, "token_acc": 0.84347826, "epoch": 0.22429907, "global_step/max_steps": "84/750", "percentage": "11.20%", "elapsed_time": "32m 38s", "remaining_time": "4h 18m 50s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042884}
+ {"loss": 0.53563607, "grad_norm": 2.9430949, "learning_rate": 4.95e-06, "token_acc": 0.82353771, "epoch": 0.22696929, "global_step/max_steps": "85/750", "percentage": "11.33%", "elapsed_time": "33m 2s", "remaining_time": "4h 18m 28s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042879}
+ {"loss": 0.47248948, "grad_norm": 2.86898516, "learning_rate": 4.94e-06, "token_acc": 0.84338623, "epoch": 0.22963952, "global_step/max_steps": "86/750", "percentage": "11.47%", "elapsed_time": "33m 26s", "remaining_time": "4h 18m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042867}
+ {"loss": 0.46606526, "grad_norm": 2.56997683, "learning_rate": 4.94e-06, "token_acc": 0.84610194, "epoch": 0.23230975, "global_step/max_steps": "87/750", "percentage": "11.60%", "elapsed_time": "33m 49s", "remaining_time": "4h 17m 43s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042875}
+ {"loss": 0.50277519, "grad_norm": 2.7458057, "learning_rate": 4.94e-06, "token_acc": 0.83897567, "epoch": 0.23497997, "global_step/max_steps": "88/750", "percentage": "11.73%", "elapsed_time": "34m 10s", "remaining_time": "4h 17m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042906}
+ {"loss": 0.4936814, "grad_norm": 2.67654973, "learning_rate": 4.94e-06, "token_acc": 0.83582497, "epoch": 0.2376502, "global_step/max_steps": "89/750", "percentage": "11.87%", "elapsed_time": "34m 33s", "remaining_time": "4h 16m 38s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042926}
+ {"loss": 0.49069792, "grad_norm": 2.65997853, "learning_rate": 4.93e-06, "token_acc": 0.83801562, "epoch": 0.24032043, "global_step/max_steps": "90/750", "percentage": "12.00%", "elapsed_time": "34m 55s", "remaining_time": "4h 16m 7s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042948}
+ {"loss": 0.5154832, "grad_norm": 2.66841192, "learning_rate": 4.93e-06, "token_acc": 0.8292045, "epoch": 0.24299065, "global_step/max_steps": "91/750", "percentage": "12.13%", "elapsed_time": "35m 18s", "remaining_time": "4h 15m 40s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042958}
+ {"loss": 0.4774642, "grad_norm": 2.76984675, "learning_rate": 4.93e-06, "token_acc": 0.84101444, "epoch": 0.24566088, "global_step/max_steps": "92/750", "percentage": "12.27%", "elapsed_time": "35m 43s", "remaining_time": "4h 15m 33s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042914}
+ {"loss": 0.51443124, "grad_norm": 2.6295351, "learning_rate": 4.93e-06, "token_acc": 0.83289403, "epoch": 0.24833111, "global_step/max_steps": "93/750", "percentage": "12.40%", "elapsed_time": "36m 8s", "remaining_time": "4h 15m 19s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042886}
+ {"loss": 0.48028237, "grad_norm": 2.73234039, "learning_rate": 4.92e-06, "token_acc": 0.84037942, "epoch": 0.25100134, "global_step/max_steps": "94/750", "percentage": "12.53%", "elapsed_time": "36m 27s", "remaining_time": "4h 14m 23s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042978}
+ {"loss": 0.46484447, "grad_norm": 2.4100259, "learning_rate": 4.92e-06, "token_acc": 0.84531331, "epoch": 0.25367156, "global_step/max_steps": "95/750", "percentage": "12.67%", "elapsed_time": "36m 47s", "remaining_time": "4h 13m 37s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043041}
+ {"loss": 0.42639616, "grad_norm": 2.531642, "learning_rate": 4.92e-06, "token_acc": 0.85547274, "epoch": 0.25634179, "global_step/max_steps": "96/750", "percentage": "12.80%", "elapsed_time": "37m 10s", "remaining_time": "4h 13m 17s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043033}
+ {"loss": 0.43437934, "grad_norm": 2.71661515, "learning_rate": 4.92e-06, "token_acc": 0.852817, "epoch": 0.25901202, "global_step/max_steps": "97/750", "percentage": "12.93%", "elapsed_time": "37m 32s", "remaining_time": "4h 12m 40s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043071}
+ {"loss": 0.46744508, "grad_norm": 2.69671582, "learning_rate": 4.91e-06, "token_acc": 0.84636527, "epoch": 0.26168224, "global_step/max_steps": "98/750", "percentage": "13.07%", "elapsed_time": "37m 54s", "remaining_time": "4h 12m 11s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04309}
+ {"loss": 0.45774141, "grad_norm": 2.97757487, "learning_rate": 4.91e-06, "token_acc": 0.85031039, "epoch": 0.26435247, "global_step/max_steps": "99/750", "percentage": "13.20%", "elapsed_time": "38m 15s", "remaining_time": "4h 11m 35s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043126}
+ {"loss": 0.45865607, "grad_norm": 2.51920976, "learning_rate": 4.91e-06, "token_acc": 0.85102785, "epoch": 0.2670227, "global_step/max_steps": "100/750", "percentage": "13.33%", "elapsed_time": "38m 40s", "remaining_time": "4h 11m 21s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.0431}
+ {"loss": 0.47016156, "grad_norm": 2.781076, "learning_rate": 4.9e-06, "token_acc": 0.84529811, "epoch": 0.26969292, "global_step/max_steps": "101/750", "percentage": "13.47%", "elapsed_time": "39m 9s", "remaining_time": "4h 11m 39s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042981}
+ {"loss": 0.47470731, "grad_norm": 2.96908552, "learning_rate": 4.9e-06, "token_acc": 0.83988202, "epoch": 0.27236315, "global_step/max_steps": "102/750", "percentage": "13.60%", "elapsed_time": "39m 30s", "remaining_time": "4h 11m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043026}
+ {"loss": 0.45952839, "grad_norm": 2.77982348, "learning_rate": 4.9e-06, "token_acc": 0.84351736, "epoch": 0.27503338, "global_step/max_steps": "103/750", "percentage": "13.73%", "elapsed_time": "39m 50s", "remaining_time": "4h 10m 16s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043086}
+ {"loss": 0.46661314, "grad_norm": 2.64092945, "learning_rate": 4.89e-06, "token_acc": 0.84133554, "epoch": 0.2777036, "global_step/max_steps": "104/750", "percentage": "13.87%", "elapsed_time": "40m 12s", "remaining_time": "4h 9m 43s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043114}
+ {"loss": 0.47708511, "grad_norm": 2.66905977, "learning_rate": 4.89e-06, "token_acc": 0.84014326, "epoch": 0.28037383, "global_step/max_steps": "105/750", "percentage": "14.00%", "elapsed_time": "40m 34s", "remaining_time": "4h 9m 14s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043131}
+ {"loss": 0.45809263, "grad_norm": 2.5572017, "learning_rate": 4.89e-06, "token_acc": 0.84823924, "epoch": 0.28304406, "global_step/max_steps": "106/750", "percentage": "14.13%", "elapsed_time": "40m 59s", "remaining_time": "4h 9m 2s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043099}
+ {"loss": 0.4886952, "grad_norm": 2.58389633, "learning_rate": 4.89e-06, "token_acc": 0.83895445, "epoch": 0.28571429, "global_step/max_steps": "107/750", "percentage": "14.27%", "elapsed_time": "41m 26s", "remaining_time": "4h 9m 2s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043031}
+ {"loss": 0.45709381, "grad_norm": 2.55273775, "learning_rate": 4.88e-06, "token_acc": 0.85046595, "epoch": 0.28838451, "global_step/max_steps": "108/750", "percentage": "14.40%", "elapsed_time": "41m 45s", "remaining_time": "4h 8m 13s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043107}
+ {"loss": 0.53926206, "grad_norm": 2.74545125, "learning_rate": 4.88e-06, "token_acc": 0.82542646, "epoch": 0.29105474, "global_step/max_steps": "109/750", "percentage": "14.53%", "elapsed_time": "42m 12s", "remaining_time": "4h 8m 14s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043036}
+ {"loss": 0.45649612, "grad_norm": 2.86313064, "learning_rate": 4.87e-06, "token_acc": 0.84482259, "epoch": 0.29372497, "global_step/max_steps": "110/750", "percentage": "14.67%", "elapsed_time": "42m 34s", "remaining_time": "4h 7m 40s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043068}
+ {"loss": 0.47792593, "grad_norm": 2.66411764, "learning_rate": 4.87e-06, "token_acc": 0.84172088, "epoch": 0.29639519, "global_step/max_steps": "111/750", "percentage": "14.80%", "elapsed_time": "42m 56s", "remaining_time": "4h 7m 13s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043077}
+ {"loss": 0.41383034, "grad_norm": 2.47914691, "learning_rate": 4.87e-06, "token_acc": 0.86086959, "epoch": 0.29906542, "global_step/max_steps": "112/750", "percentage": "14.93%", "elapsed_time": "43m 21s", "remaining_time": "4h 6m 58s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043054}
+ {"loss": 0.48211718, "grad_norm": 2.51735911, "learning_rate": 4.86e-06, "token_acc": 0.83951277, "epoch": 0.30173565, "global_step/max_steps": "113/750", "percentage": "15.07%", "elapsed_time": "43m 45s", "remaining_time": "4h 6m 41s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043035}
+ {"loss": 0.47428066, "grad_norm": 2.71498641, "learning_rate": 4.86e-06, "token_acc": 0.84289944, "epoch": 0.30440587, "global_step/max_steps": "114/750", "percentage": "15.20%", "elapsed_time": "44m 5s", "remaining_time": "4h 5m 59s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04309}
+ {"loss": 0.46621677, "grad_norm": 2.80723392, "learning_rate": 4.86e-06, "token_acc": 0.84289783, "epoch": 0.3070761, "global_step/max_steps": "115/750", "percentage": "15.33%", "elapsed_time": "44m 27s", "remaining_time": "4h 5m 30s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043109}
+ {"loss": 0.44936496, "grad_norm": 2.4883186, "learning_rate": 4.85e-06, "token_acc": 0.84700274, "epoch": 0.30974633, "global_step/max_steps": "116/750", "percentage": "15.47%", "elapsed_time": "44m 50s", "remaining_time": "4h 5m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043118}
+ {"loss": 0.50375044, "grad_norm": 2.64161239, "learning_rate": 4.85e-06, "token_acc": 0.82930297, "epoch": 0.31241656, "global_step/max_steps": "117/750", "percentage": "15.60%", "elapsed_time": "45m 12s", "remaining_time": "4h 4m 34s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043136}
+ {"loss": 0.42930186, "grad_norm": 2.71646414, "learning_rate": 4.85e-06, "token_acc": 0.85322839, "epoch": 0.31508678, "global_step/max_steps": "118/750", "percentage": "15.73%", "elapsed_time": "45m 36s", "remaining_time": "4h 4m 18s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043116}
+ {"loss": 0.45925462, "grad_norm": 2.5869505, "learning_rate": 4.84e-06, "token_acc": 0.84819764, "epoch": 0.31775701, "global_step/max_steps": "119/750", "percentage": "15.87%", "elapsed_time": "46m 1s", "remaining_time": "4h 4m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04309}
+ {"loss": 0.46710986, "grad_norm": 2.61488251, "learning_rate": 4.84e-06, "token_acc": 0.84349996, "epoch": 0.32042724, "global_step/max_steps": "120/750", "percentage": "16.00%", "elapsed_time": "46m 27s", "remaining_time": "4h 3m 52s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043054}
+ {"loss": 0.46162307, "grad_norm": 2.60637056, "learning_rate": 4.83e-06, "token_acc": 0.84564304, "epoch": 0.32309746, "global_step/max_steps": "121/750", "percentage": "16.13%", "elapsed_time": "46m 48s", "remaining_time": "4h 3m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043082}
+ {"loss": 0.48382646, "grad_norm": 2.68137678, "learning_rate": 4.83e-06, "token_acc": 0.83610612, "epoch": 0.32576769, "global_step/max_steps": "122/750", "percentage": "16.27%", "elapsed_time": "47m 8s", "remaining_time": "4h 2m 40s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04313}
+ {"loss": 0.44379669, "grad_norm": 2.60598634, "learning_rate": 4.83e-06, "token_acc": 0.85078573, "epoch": 0.32843792, "global_step/max_steps": "123/750", "percentage": "16.40%", "elapsed_time": "47m 32s", "remaining_time": "4h 2m 21s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043118}
+ {"loss": 0.47259939, "grad_norm": 2.57065172, "learning_rate": 4.82e-06, "token_acc": 0.83720303, "epoch": 0.33110814, "global_step/max_steps": "124/750", "percentage": "16.53%", "elapsed_time": "47m 52s", "remaining_time": "4h 1m 41s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043167}
+ {"loss": 0.43784583, "grad_norm": 2.66877627, "learning_rate": 4.82e-06, "token_acc": 0.85328507, "epoch": 0.33377837, "global_step/max_steps": "125/750", "percentage": "16.67%", "elapsed_time": "48m 13s", "remaining_time": "4h 1m 5s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043205}
+ {"loss": 0.49782473, "grad_norm": 2.68240806, "learning_rate": 4.81e-06, "token_acc": 0.83495653, "epoch": 0.3364486, "global_step/max_steps": "126/750", "percentage": "16.80%", "elapsed_time": "48m 32s", "remaining_time": "4h 0m 25s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043256}
+ {"loss": 0.45435789, "grad_norm": 2.76874097, "learning_rate": 4.81e-06, "token_acc": 0.84716034, "epoch": 0.33911883, "global_step/max_steps": "127/750", "percentage": "16.93%", "elapsed_time": "48m 55s", "remaining_time": "4h 0m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043261}
+ {"loss": 0.44461328, "grad_norm": 2.92010891, "learning_rate": 4.81e-06, "token_acc": 0.8496424, "epoch": 0.34178905, "global_step/max_steps": "128/750", "percentage": "17.07%", "elapsed_time": "49m 13s", "remaining_time": "3h 59m 12s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043336}
+ {"loss": 0.4642204, "grad_norm": 2.76921943, "learning_rate": 4.8e-06, "token_acc": 0.84203398, "epoch": 0.34445928, "global_step/max_steps": "129/750", "percentage": "17.20%", "elapsed_time": "49m 34s", "remaining_time": "3h 58m 37s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043373}
+ {"loss": 0.45947498, "grad_norm": 2.47558522, "learning_rate": 4.8e-06, "token_acc": 0.84888834, "epoch": 0.34712951, "global_step/max_steps": "130/750", "percentage": "17.33%", "elapsed_time": "49m 54s", "remaining_time": "3h 58m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043417}
+ {"loss": 0.50250471, "grad_norm": 2.6805787, "learning_rate": 4.79e-06, "token_acc": 0.82905984, "epoch": 0.34979973, "global_step/max_steps": "131/750", "percentage": "17.47%", "elapsed_time": "50m 14s", "remaining_time": "3h 57m 25s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043451}
+ {"loss": 0.48010272, "grad_norm": 2.7863942, "learning_rate": 4.79e-06, "token_acc": 0.83882928, "epoch": 0.35246996, "global_step/max_steps": "132/750", "percentage": "17.60%", "elapsed_time": "50m 38s", "remaining_time": "3h 57m 5s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043444}
+ {"loss": 0.44011635, "grad_norm": 2.48276467, "learning_rate": 4.78e-06, "token_acc": 0.85332054, "epoch": 0.35514019, "global_step/max_steps": "133/750", "percentage": "17.73%", "elapsed_time": "51m 3s", "remaining_time": "3h 56m 49s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043421}
+ {"loss": 0.46361014, "grad_norm": 2.67930095, "learning_rate": 4.78e-06, "token_acc": 0.84670627, "epoch": 0.35781041, "global_step/max_steps": "134/750", "percentage": "17.87%", "elapsed_time": "51m 26s", "remaining_time": "3h 56m 30s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043409}
+ {"loss": 0.45936275, "grad_norm": 2.46022723, "learning_rate": 4.77e-06, "token_acc": 0.84774953, "epoch": 0.36048064, "global_step/max_steps": "135/750", "percentage": "18.00%", "elapsed_time": "51m 51s", "remaining_time": "3h 56m 12s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043394}
+ {"loss": 0.44871199, "grad_norm": 2.73454051, "learning_rate": 4.77e-06, "token_acc": 0.85209233, "epoch": 0.36315087, "global_step/max_steps": "136/750", "percentage": "18.13%", "elapsed_time": "52m 13s", "remaining_time": "3h 55m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043397}
+ {"loss": 0.44954139, "grad_norm": 2.58445544, "learning_rate": 4.77e-06, "token_acc": 0.85186768, "epoch": 0.36582109, "global_step/max_steps": "137/750", "percentage": "18.27%", "elapsed_time": "52m 33s", "remaining_time": "3h 55m 11s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04344}
+ {"loss": 0.42780191, "grad_norm": 2.5114116, "learning_rate": 4.76e-06, "token_acc": 0.85548055, "epoch": 0.36849132, "global_step/max_steps": "138/750", "percentage": "18.40%", "elapsed_time": "52m 55s", "remaining_time": "3h 54m 43s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043454}
+ {"loss": 0.42587936, "grad_norm": 2.4745295, "learning_rate": 4.76e-06, "token_acc": 0.85465199, "epoch": 0.37116155, "global_step/max_steps": "139/750", "percentage": "18.53%", "elapsed_time": "53m 15s", "remaining_time": "3h 54m 6s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043497}
+ {"loss": 0.45688677, "grad_norm": 2.5234827, "learning_rate": 4.75e-06, "token_acc": 0.84741819, "epoch": 0.37383178, "global_step/max_steps": "140/750", "percentage": "18.67%", "elapsed_time": "53m 36s", "remaining_time": "3h 53m 33s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043529}
+ {"loss": 0.45167503, "grad_norm": 2.60895767, "learning_rate": 4.75e-06, "token_acc": 0.85042483, "epoch": 0.376502, "global_step/max_steps": "141/750", "percentage": "18.80%", "elapsed_time": "54m 0s", "remaining_time": "3h 53m 15s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043513}
+ {"loss": 0.51007497, "grad_norm": 2.75437796, "learning_rate": 4.74e-06, "token_acc": 0.82781076, "epoch": 0.37917223, "global_step/max_steps": "142/750", "percentage": "18.93%", "elapsed_time": "54m 23s", "remaining_time": "3h 52m 51s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043518}
+ {"loss": 0.41679853, "grad_norm": 2.31693874, "learning_rate": 4.74e-06, "token_acc": 0.86460167, "epoch": 0.38184246, "global_step/max_steps": "143/750", "percentage": "19.07%", "elapsed_time": "54m 41s", "remaining_time": "3h 52m 9s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
+ {"loss": 0.42220882, "grad_norm": 2.46720865, "learning_rate": 4.73e-06, "token_acc": 0.86303991, "epoch": 0.38451268, "global_step/max_steps": "144/750", "percentage": "19.20%", "elapsed_time": "55m 3s", "remaining_time": "3h 51m 43s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043588}
+ {"loss": 0.471122, "grad_norm": 2.47993604, "learning_rate": 4.73e-06, "token_acc": 0.84121382, "epoch": 0.38718291, "global_step/max_steps": "145/750", "percentage": "19.33%", "elapsed_time": "55m 23s", "remaining_time": "3h 51m 8s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043624}
+ {"loss": 0.45192125, "grad_norm": 2.60728876, "learning_rate": 4.72e-06, "token_acc": 0.8470009, "epoch": 0.38985314, "global_step/max_steps": "146/750", "percentage": "19.47%", "elapsed_time": "55m 48s", "remaining_time": "3h 50m 51s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043604}
+ {"loss": 0.47766036, "grad_norm": 2.42901845, "learning_rate": 4.72e-06, "token_acc": 0.83968115, "epoch": 0.39252336, "global_step/max_steps": "147/750", "percentage": "19.60%", "elapsed_time": "56m 12s", "remaining_time": "3h 50m 32s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043593}
+ {"loss": 0.44105217, "grad_norm": 2.3236883, "learning_rate": 4.71e-06, "token_acc": 0.85267359, "epoch": 0.39519359, "global_step/max_steps": "148/750", "percentage": "19.73%", "elapsed_time": "56m 32s", "remaining_time": "3h 50m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043621}
+ {"loss": 0.46359736, "grad_norm": 2.51582729, "learning_rate": 4.71e-06, "token_acc": 0.84363395, "epoch": 0.39786382, "global_step/max_steps": "149/750", "percentage": "19.87%", "elapsed_time": "57m 2s", "remaining_time": "3h 50m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04354}
+ {"loss": 0.45162177, "grad_norm": 2.4586073, "learning_rate": 4.7e-06, "token_acc": 0.85317409, "epoch": 0.40053405, "global_step/max_steps": "150/750", "percentage": "20.00%", "elapsed_time": "57m 21s", "remaining_time": "3h 49m 27s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04358}
+ {"loss": 0.46587312, "grad_norm": 2.46946774, "learning_rate": 4.7e-06, "token_acc": 0.8441934, "epoch": 0.40320427, "global_step/max_steps": "151/750", "percentage": "20.13%", "elapsed_time": "57m 46s", "remaining_time": "3h 49m 10s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043562}
+ {"loss": 0.44039536, "grad_norm": 2.5210355, "learning_rate": 4.69e-06, "token_acc": 0.85168988, "epoch": 0.4058745, "global_step/max_steps": "152/750", "percentage": "20.27%", "elapsed_time": "58m 8s", "remaining_time": "3h 48m 42s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
+ {"loss": 0.38906497, "grad_norm": 2.28278218, "learning_rate": 4.69e-06, "token_acc": 0.86812884, "epoch": 0.40854473, "global_step/max_steps": "153/750", "percentage": "20.40%", "elapsed_time": "58m 32s", "remaining_time": "3h 48m 26s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043556}
+ {"loss": 0.48725152, "grad_norm": 2.79372161, "learning_rate": 4.68e-06, "token_acc": 0.83746082, "epoch": 0.41121495, "global_step/max_steps": "154/750", "percentage": "20.53%", "elapsed_time": "58m 54s", "remaining_time": "3h 47m 59s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043568}
+ {"loss": 0.50707674, "grad_norm": 2.663143, "learning_rate": 4.67e-06, "token_acc": 0.83335632, "epoch": 0.41388518, "global_step/max_steps": "155/750", "percentage": "20.67%", "elapsed_time": "59m 22s", "remaining_time": "3h 47m 53s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043515}
+ {"loss": 0.46514195, "grad_norm": 2.49969478, "learning_rate": 4.67e-06, "token_acc": 0.8445397, "epoch": 0.41655541, "global_step/max_steps": "156/750", "percentage": "20.80%", "elapsed_time": "59m 42s", "remaining_time": "3h 47m 22s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043541}
+ {"loss": 0.43904352, "grad_norm": 2.45751856, "learning_rate": 4.66e-06, "token_acc": 0.85044581, "epoch": 0.41922563, "global_step/max_steps": "157/750", "percentage": "20.93%", "elapsed_time": "1h 0m 4s", "remaining_time": "3h 46m 54s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043556}
+ {"loss": 0.47703558, "grad_norm": 2.52444432, "learning_rate": 4.66e-06, "token_acc": 0.8391282, "epoch": 0.42189586, "global_step/max_steps": "158/750", "percentage": "21.07%", "elapsed_time": "1h 0m 24s", "remaining_time": "3h 46m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043592}
+ {"loss": 0.46014819, "grad_norm": 2.58692895, "learning_rate": 4.65e-06, "token_acc": 0.84484124, "epoch": 0.42456609, "global_step/max_steps": "159/750", "percentage": "21.20%", "elapsed_time": "1h 0m 48s", "remaining_time": "3h 46m 2s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043575}
+ {"loss": 0.44626755, "grad_norm": 2.5726889, "learning_rate": 4.65e-06, "token_acc": 0.85418391, "epoch": 0.42723632, "global_step/max_steps": "160/750", "percentage": "21.33%", "elapsed_time": "1h 1m 11s", "remaining_time": "3h 45m 39s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
+ {"loss": 0.43762302, "grad_norm": 2.57624818, "learning_rate": 4.64e-06, "token_acc": 0.85414863, "epoch": 0.42990654, "global_step/max_steps": "161/750", "percentage": "21.47%", "elapsed_time": "1h 1m 33s", "remaining_time": "3h 45m 13s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043586}
+ {"loss": 0.45935273, "grad_norm": 2.4689486, "learning_rate": 4.64e-06, "token_acc": 0.85112906, "epoch": 0.43257677, "global_step/max_steps": "162/750", "percentage": "21.60%", "elapsed_time": "1h 1m 56s", "remaining_time": "3h 44m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043592}
+ {"loss": 0.42275417, "grad_norm": 2.43153137, "learning_rate": 4.63e-06, "token_acc": 0.8603887, "epoch": 0.435247, "global_step/max_steps": "163/750", "percentage": "21.73%", "elapsed_time": "1h 2m 20s", "remaining_time": "3h 44m 29s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04358}
+ {"loss": 0.40657133, "grad_norm": 2.68355566, "learning_rate": 4.62e-06, "token_acc": 0.86099553, "epoch": 0.43791722, "global_step/max_steps": "164/750", "percentage": "21.87%", "elapsed_time": "1h 2m 40s", "remaining_time": "3h 43m 57s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04361}
+ {"loss": 0.4761599, "grad_norm": 2.48708559, "learning_rate": 4.62e-06, "token_acc": 0.84103781, "epoch": 0.44058745, "global_step/max_steps": "165/750", "percentage": "22.00%", "elapsed_time": "1h 3m 7s", "remaining_time": "3h 43m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043563}
+ {"loss": 0.47588837, "grad_norm": 2.76328807, "learning_rate": 4.61e-06, "token_acc": 0.84062725, "epoch": 0.44325768, "global_step/max_steps": "166/750", "percentage": "22.13%", "elapsed_time": "1h 3m 32s", "remaining_time": "3h 43m 31s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043545}
+ {"loss": 0.4535282, "grad_norm": 2.5396113, "learning_rate": 4.61e-06, "token_acc": 0.84282684, "epoch": 0.4459279, "global_step/max_steps": "167/750", "percentage": "22.27%", "elapsed_time": "1h 3m 55s", "remaining_time": "3h 43m 9s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043541}
+ {"loss": 0.41487816, "grad_norm": 2.47265134, "learning_rate": 4.6e-06, "token_acc": 0.85979295, "epoch": 0.44859813, "global_step/max_steps": "168/750", "percentage": "22.40%", "elapsed_time": "1h 4m 19s", "remaining_time": "3h 42m 52s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043524}
+ {"loss": 0.44197559, "grad_norm": 2.55595802, "learning_rate": 4.59e-06, "token_acc": 0.84797686, "epoch": 0.45126836, "global_step/max_steps": "169/750", "percentage": "22.53%", "elapsed_time": "1h 4m 39s", "remaining_time": "3h 42m 16s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043563}
+ {"loss": 0.42982167, "grad_norm": 2.3608869, "learning_rate": 4.59e-06, "token_acc": 0.85456234, "epoch": 0.45393858, "global_step/max_steps": "170/750", "percentage": "22.67%", "elapsed_time": "1h 5m 2s", "remaining_time": "3h 41m 54s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043562}
+ {"loss": 0.42837453, "grad_norm": 2.58920862, "learning_rate": 4.58e-06, "token_acc": 0.85819733, "epoch": 0.45660881, "global_step/max_steps": "171/750", "percentage": "22.80%", "elapsed_time": "1h 5m 22s", "remaining_time": "3h 41m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043598}
v6-20250917-134949/runs/events.out.tfevents.1758088221.TENCENT64.site.222971.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a821e778e9c842081277c225575c484a82719f4a958746cc7317fbb0ee2c29ee
+ size 52504
v6-20250917-134949/val_dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff