Upload folder using huggingface_hub
Browse files- log/20250917-13:20:02.log +0 -0
- log/20250917-13:25:53.log +676 -0
- log/20250917-13:35:09.log +676 -0
- log/20250917-13:41:16.log +676 -0
- log/20250917-13:44:32.log +676 -0
- log/20250917-13:46:26.log +675 -0
- log/20250917-13:49:21.log +0 -0
- v5-20250917-134655/args.json +384 -0
- v5-20250917-134655/images/train_epoch.png +0 -0
- v5-20250917-134655/images/train_grad_norm.png +0 -0
- v5-20250917-134655/images/train_learning_rate.png +0 -0
- v5-20250917-134655/images/train_loss.png +0 -0
- v5-20250917-134655/images/train_token_acc.png +0 -0
- v5-20250917-134655/logging.jsonl +3 -0
- v5-20250917-134655/runs/events.out.tfevents.1758088071.TENCENT64.site.218247.0 +3 -0
- v5-20250917-134655/val_dataset.jsonl +0 -0
- v6-20250917-134949/args.json +384 -0
- v6-20250917-134949/logging.jsonl +171 -0
- v6-20250917-134949/runs/events.out.tfevents.1758088221.TENCENT64.site.222971.0 +3 -0
- v6-20250917-134949/val_dataset.jsonl +0 -0
log/20250917-13:20:02.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
log/20250917-13:25:53.log
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 9 |
+
[2025-09-17 13:26:10,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 10 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 11 |
+
[2025-09-17 13:26:11,277] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 12 |
+
[2025-09-17 13:26:11,772] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 13 |
+
[2025-09-17 13:26:11,781] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 14 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 15 |
+
[2025-09-17 13:26:12,631] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 16 |
+
[2025-09-17 13:26:12,640] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 17 |
+
[2025-09-17 13:26:12,992] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 18 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 19 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 20 |
+
[2025-09-17 13:26:14,391] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 21 |
+
[2025-09-17 13:26:14,400] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 22 |
+
[2025-09-17 13:26:14,793] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 23 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 24 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 25 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 26 |
+
[INFO:swift] Setting args.lazy_tokenize: False
|
| 27 |
+
[2025-09-17 13:26:16,173] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 28 |
+
[2025-09-17 13:26:16,182] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 29 |
+
[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
|
| 30 |
+
[2025-09-17 13:26:16,449] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 31 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 32 |
+
[2025-09-17 13:26:17,770] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 33 |
+
[2025-09-17 13:26:17,779] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 34 |
+
[2025-09-17 13:26:17,779] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 35 |
+
[2025-09-17 13:26:18,258] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 36 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 37 |
+
[2025-09-17 13:26:19,673] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 38 |
+
[2025-09-17 13:26:19,683] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 39 |
+
[2025-09-17 13:26:20,035] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 40 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 41 |
+
[2025-09-17 13:26:21,373] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 42 |
+
[2025-09-17 13:26:21,381] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 43 |
+
[2025-09-17 13:26:21,413] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 44 |
+
[2025-09-17 13:26:22,788] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 45 |
+
[2025-09-17 13:26:22,801] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 46 |
+
[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625
|
| 47 |
+
[INFO:swift] Global seed set to 42
|
| 48 |
+
[INFO:swift] args: TrainArguments(
|
| 49 |
+
_n_gpu=-1,
|
| 50 |
+
acc_strategy=token,
|
| 51 |
+
accelerator_config={'dispatch_batches': False},
|
| 52 |
+
adafactor=False,
|
| 53 |
+
adalora_beta1=0.85,
|
| 54 |
+
adalora_beta2=0.85,
|
| 55 |
+
adalora_deltaT=1,
|
| 56 |
+
adalora_init_r=12,
|
| 57 |
+
adalora_orth_reg_weight=0.5,
|
| 58 |
+
adalora_target_r=8,
|
| 59 |
+
adalora_tfinal=0,
|
| 60 |
+
adalora_tinit=0,
|
| 61 |
+
adam_beta1=0.9,
|
| 62 |
+
adam_beta2=0.95,
|
| 63 |
+
adam_epsilon=1e-08,
|
| 64 |
+
adapter_act=gelu,
|
| 65 |
+
adapter_length=128,
|
| 66 |
+
adapters=[],
|
| 67 |
+
add_version=True,
|
| 68 |
+
agent_template=None,
|
| 69 |
+
aligner_lr=None,
|
| 70 |
+
attn_impl=None,
|
| 71 |
+
auto_find_batch_size=False,
|
| 72 |
+
average_tokens_across_devices=True,
|
| 73 |
+
batch_eval_metrics=False,
|
| 74 |
+
bf16=True,
|
| 75 |
+
bf16_full_eval=False,
|
| 76 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 77 |
+
bnb_4bit_quant_storage=None,
|
| 78 |
+
bnb_4bit_quant_type=nf4,
|
| 79 |
+
bnb_4bit_use_double_quant=True,
|
| 80 |
+
boft_block_num=0,
|
| 81 |
+
boft_block_size=4,
|
| 82 |
+
boft_dropout=0.0,
|
| 83 |
+
boft_n_butterfly_factor=1,
|
| 84 |
+
cached_dataset=[],
|
| 85 |
+
channels=None,
|
| 86 |
+
check_model=True,
|
| 87 |
+
ckpt_dir=None,
|
| 88 |
+
columns={},
|
| 89 |
+
create_checkpoint_symlink=False,
|
| 90 |
+
custom_dataset_info=[],
|
| 91 |
+
custom_register_path=[],
|
| 92 |
+
data_seed=42,
|
| 93 |
+
dataloader_drop_last=False,
|
| 94 |
+
dataloader_num_workers=48,
|
| 95 |
+
dataloader_persistent_workers=False,
|
| 96 |
+
dataloader_pin_memory=True,
|
| 97 |
+
dataloader_prefetch_factor=None,
|
| 98 |
+
dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
|
| 99 |
+
dataset_num_proc=100,
|
| 100 |
+
dataset_shuffle=True,
|
| 101 |
+
ddp_backend=None,
|
| 102 |
+
ddp_broadcast_buffers=None,
|
| 103 |
+
ddp_bucket_cap_mb=None,
|
| 104 |
+
ddp_find_unused_parameters=None,
|
| 105 |
+
ddp_timeout=18000000,
|
| 106 |
+
debug=None,
|
| 107 |
+
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
|
| 108 |
+
deepspeed_autotp_size=None,
|
| 109 |
+
device_map=None,
|
| 110 |
+
disable_tqdm=None,
|
| 111 |
+
do_eval=False,
|
| 112 |
+
do_predict=False,
|
| 113 |
+
do_train=False,
|
| 114 |
+
download_mode=reuse_dataset_if_exists,
|
| 115 |
+
ds3_gather_for_generation=True,
|
| 116 |
+
early_stop_interval=None,
|
| 117 |
+
enable_dft_loss=False,
|
| 118 |
+
eval_accumulation_steps=None,
|
| 119 |
+
eval_dataset=[],
|
| 120 |
+
eval_dataset_args=None,
|
| 121 |
+
eval_delay=0,
|
| 122 |
+
eval_do_concat_batches=True,
|
| 123 |
+
eval_generation_config=None,
|
| 124 |
+
eval_limit=None,
|
| 125 |
+
eval_on_start=False,
|
| 126 |
+
eval_steps=2000.0,
|
| 127 |
+
eval_strategy=epoch,
|
| 128 |
+
eval_use_evalscope=False,
|
| 129 |
+
eval_use_gather_object=False,
|
| 130 |
+
external_plugins=[],
|
| 131 |
+
extra_eval_args=None,
|
| 132 |
+
fourier_n_frequency=2000,
|
| 133 |
+
fourier_scaling=300.0,
|
| 134 |
+
fp16=False,
|
| 135 |
+
fp16_backend=auto,
|
| 136 |
+
fp16_full_eval=False,
|
| 137 |
+
fp16_opt_level=O1,
|
| 138 |
+
freeze_aligner=False,
|
| 139 |
+
freeze_llm=False,
|
| 140 |
+
freeze_parameters=[],
|
| 141 |
+
freeze_parameters_ratio=0.0,
|
| 142 |
+
freeze_parameters_regex=None,
|
| 143 |
+
freeze_vit=True,
|
| 144 |
+
fsdp=,
|
| 145 |
+
fsdp_config=None,
|
| 146 |
+
fsdp_min_num_params=0,
|
| 147 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 148 |
+
full_determinism=False,
|
| 149 |
+
galore_cos_threshold=0.4,
|
| 150 |
+
galore_gamma_proj=2,
|
| 151 |
+
galore_optim_per_parameter=False,
|
| 152 |
+
galore_proj_bits=4,
|
| 153 |
+
galore_proj_group_size=256,
|
| 154 |
+
galore_proj_quant=False,
|
| 155 |
+
galore_proj_type=std,
|
| 156 |
+
galore_quantization=False,
|
| 157 |
+
galore_queue_size=5,
|
| 158 |
+
galore_rank=128,
|
| 159 |
+
galore_scale=1.0,
|
| 160 |
+
galore_target_modules=None,
|
| 161 |
+
galore_update_proj_gap=50,
|
| 162 |
+
galore_with_embedding=False,
|
| 163 |
+
generation_config=None,
|
| 164 |
+
generation_max_length=None,
|
| 165 |
+
generation_num_beams=None,
|
| 166 |
+
gradient_accumulation_steps=4,
|
| 167 |
+
gradient_checkpointing=True,
|
| 168 |
+
gradient_checkpointing_kwargs=None,
|
| 169 |
+
greater_is_better=False,
|
| 170 |
+
group_by_length=False,
|
| 171 |
+
half_precision_backend=auto,
|
| 172 |
+
hqq_axis=None,
|
| 173 |
+
hub_always_push=False,
|
| 174 |
+
hub_model_id=None,
|
| 175 |
+
hub_private_repo=None,
|
| 176 |
+
hub_revision=None,
|
| 177 |
+
hub_strategy=every_save,
|
| 178 |
+
hub_token=<HUB_TOKEN>,
|
| 179 |
+
ignore_args_error=False,
|
| 180 |
+
ignore_data_skip=False,
|
| 181 |
+
include_for_metrics=[],
|
| 182 |
+
include_inputs_for_metrics=False,
|
| 183 |
+
include_num_input_tokens_seen=False,
|
| 184 |
+
include_tokens_per_second=False,
|
| 185 |
+
init_strategy=None,
|
| 186 |
+
init_weights=True,
|
| 187 |
+
interleave_prob=None,
|
| 188 |
+
jit_mode_eval=False,
|
| 189 |
+
label_names=None,
|
| 190 |
+
label_smoothing_factor=0.0,
|
| 191 |
+
lazy_tokenize=False,
|
| 192 |
+
learning_rate=5e-06,
|
| 193 |
+
length_column_name=length,
|
| 194 |
+
liger_kernel_config=None,
|
| 195 |
+
lisa_activated_layers=0,
|
| 196 |
+
lisa_step_interval=20,
|
| 197 |
+
llamapro_num_groups=None,
|
| 198 |
+
llamapro_num_new_blocks=4,
|
| 199 |
+
load_args=False,
|
| 200 |
+
load_best_model_at_end=False,
|
| 201 |
+
load_data_args=False,
|
| 202 |
+
load_from_cache_file=True,
|
| 203 |
+
local_rank=0,
|
| 204 |
+
local_repo_path=None,
|
| 205 |
+
log_level=passive,
|
| 206 |
+
log_level_replica=warning,
|
| 207 |
+
log_on_each_node=True,
|
| 208 |
+
logging_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625/runs,
|
| 209 |
+
logging_first_step=True,
|
| 210 |
+
logging_nan_inf_filter=True,
|
| 211 |
+
logging_steps=1,
|
| 212 |
+
logging_strategy=steps,
|
| 213 |
+
logprobs=False,
|
| 214 |
+
lora_alpha=32,
|
| 215 |
+
lora_bias=none,
|
| 216 |
+
lora_dropout=0.05,
|
| 217 |
+
lora_dtype=None,
|
| 218 |
+
lora_ga_batch_size=2,
|
| 219 |
+
lora_ga_direction=ArB2r,
|
| 220 |
+
lora_ga_iters=2,
|
| 221 |
+
lora_ga_max_length=1024,
|
| 222 |
+
lora_ga_scale=stable,
|
| 223 |
+
lora_ga_stable_gamma=16,
|
| 224 |
+
lora_modules=[],
|
| 225 |
+
lora_rank=8,
|
| 226 |
+
lorap_lr_ratio=None,
|
| 227 |
+
loss_scale=default,
|
| 228 |
+
loss_type=None,
|
| 229 |
+
lr_scheduler_kwargs=None,
|
| 230 |
+
lr_scheduler_type=cosine,
|
| 231 |
+
max_epochs=None,
|
| 232 |
+
max_grad_norm=1.0,
|
| 233 |
+
max_length=16240,
|
| 234 |
+
max_memory={},
|
| 235 |
+
max_model_len=None,
|
| 236 |
+
max_new_tokens=64,
|
| 237 |
+
max_pixels=None,
|
| 238 |
+
max_steps=-1,
|
| 239 |
+
metric=None,
|
| 240 |
+
metric_for_best_model=loss,
|
| 241 |
+
model=Qwen/Qwen2.5-7B-Instruct,
|
| 242 |
+
model_author=None,
|
| 243 |
+
model_kwargs={},
|
| 244 |
+
model_name=None,
|
| 245 |
+
model_revision=None,
|
| 246 |
+
model_type=qwen2_5,
|
| 247 |
+
modules_to_save=[],
|
| 248 |
+
mp_parameters=,
|
| 249 |
+
neftune_noise_alpha=None,
|
| 250 |
+
new_special_tokens=[],
|
| 251 |
+
no_cuda=False,
|
| 252 |
+
norm_bbox=None,
|
| 253 |
+
num_beams=1,
|
| 254 |
+
num_labels=None,
|
| 255 |
+
num_train_epochs=2.0,
|
| 256 |
+
optim=adamw_torch_fused,
|
| 257 |
+
optim_args=None,
|
| 258 |
+
optim_target_modules=None,
|
| 259 |
+
optimizer=None,
|
| 260 |
+
output_dir=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625,
|
| 261 |
+
overwrite_output_dir=False,
|
| 262 |
+
packing=False,
|
| 263 |
+
packing_length=None,
|
| 264 |
+
padding_free=False,
|
| 265 |
+
padding_side=right,
|
| 266 |
+
past_index=-1,
|
| 267 |
+
per_device_eval_batch_size=1,
|
| 268 |
+
per_device_train_batch_size=2,
|
| 269 |
+
predict_with_generate=False,
|
| 270 |
+
prediction_loss_only=False,
|
| 271 |
+
problem_type=None,
|
| 272 |
+
push_to_hub=False,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 276 |
+
quant_bits=None,
|
| 277 |
+
quant_method=None,
|
| 278 |
+
ray_scope=last,
|
| 279 |
+
reft_args=None,
|
| 280 |
+
reft_intervention_type=LoreftIntervention,
|
| 281 |
+
reft_layer_key=None,
|
| 282 |
+
reft_layers=None,
|
| 283 |
+
reft_rank=4,
|
| 284 |
+
remove_unused_columns=True,
|
| 285 |
+
repetition_penalty=None,
|
| 286 |
+
report_to=['tensorboard'],
|
| 287 |
+
response_prefix=None,
|
| 288 |
+
restore_callback_states_from_checkpoint=False,
|
| 289 |
+
resume_from_checkpoint=None,
|
| 290 |
+
resume_only_model=False,
|
| 291 |
+
rope_scaling=None,
|
| 292 |
+
router_aux_loss_coef=0.0,
|
| 293 |
+
run_name=/group/40143/hongzhuyi/ms-swift/output/v1-20250917-132625,
|
| 294 |
+
save_on_each_node=False,
|
| 295 |
+
save_only_model=False,
|
| 296 |
+
save_safetensors=True,
|
| 297 |
+
save_steps=500,
|
| 298 |
+
save_strategy=epoch,
|
| 299 |
+
save_total_limit=None,
|
| 300 |
+
seed=42,
|
| 301 |
+
sequence_parallel_size=1,
|
| 302 |
+
shuffle_buffer_size=1000,
|
| 303 |
+
skip_memory_metrics=True,
|
| 304 |
+
sortish_sampler=False,
|
| 305 |
+
split_dataset_ratio=0.001,
|
| 306 |
+
stop_words=[],
|
| 307 |
+
stopping_strategy=first_exhausted,
|
| 308 |
+
stream=False,
|
| 309 |
+
streaming=False,
|
| 310 |
+
strict=False,
|
| 311 |
+
swanlab_exp_name=None,
|
| 312 |
+
swanlab_lark_secret=None,
|
| 313 |
+
swanlab_lark_webhook_url=None,
|
| 314 |
+
swanlab_mode=cloud,
|
| 315 |
+
swanlab_project=None,
|
| 316 |
+
swanlab_token=<SWANLAB_TOKEN>,
|
| 317 |
+
swanlab_workspace=None,
|
| 318 |
+
system=None,
|
| 319 |
+
target_modules=['all-linear'],
|
| 320 |
+
target_regex=None,
|
| 321 |
+
task_type=causal_lm,
|
| 322 |
+
temperature=0.0,
|
| 323 |
+
template=qwen2_5,
|
| 324 |
+
template_backend=swift,
|
| 325 |
+
tf32=None,
|
| 326 |
+
top_k=None,
|
| 327 |
+
top_logprobs=None,
|
| 328 |
+
top_p=None,
|
| 329 |
+
torch_compile=False,
|
| 330 |
+
torch_compile_backend=None,
|
| 331 |
+
torch_compile_mode=None,
|
| 332 |
+
torch_dtype=torch.bfloat16,
|
| 333 |
+
torch_empty_cache_steps=None,
|
| 334 |
+
torchdynamo=None,
|
| 335 |
+
tpu_metrics_debug=False,
|
| 336 |
+
tpu_num_cores=None,
|
| 337 |
+
train_dataloader_shuffle=True,
|
| 338 |
+
train_type=full,
|
| 339 |
+
trainable_parameters=[],
|
| 340 |
+
trainable_parameters_regex=None,
|
| 341 |
+
truncation_strategy=delete,
|
| 342 |
+
tuner_backend=peft,
|
| 343 |
+
use_chat_template=True,
|
| 344 |
+
use_cpu=False,
|
| 345 |
+
use_dora=False,
|
| 346 |
+
use_flash_ckpt=False,
|
| 347 |
+
use_galore=False,
|
| 348 |
+
use_hf=False,
|
| 349 |
+
use_ipex=False,
|
| 350 |
+
use_legacy_prediction_loop=False,
|
| 351 |
+
use_liger_kernel=False,
|
| 352 |
+
use_logits_to_keep=None,
|
| 353 |
+
use_mps_device=False,
|
| 354 |
+
use_rslora=False,
|
| 355 |
+
use_swift_lora=False,
|
| 356 |
+
val_dataset=[],
|
| 357 |
+
val_dataset_shuffle=False,
|
| 358 |
+
vera_d_initial=0.1,
|
| 359 |
+
vera_dropout=0.0,
|
| 360 |
+
vera_projection_prng_key=0,
|
| 361 |
+
vera_rank=256,
|
| 362 |
+
vit_gradient_checkpointing=None,
|
| 363 |
+
vit_lr=None,
|
| 364 |
+
warmup_ratio=0.05,
|
| 365 |
+
warmup_steps=0,
|
| 366 |
+
weight_decay=0.1,
|
| 367 |
+
zero_hpz_partition_size=None,
|
| 368 |
+
)
|
| 369 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 370 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 371 |
+
[2025-09-17 13:26:28,459] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 372 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 373 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 374 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 375 |
+
[INFO:swift] model_kwargs: {'device_map': None}
|
| 376 |
+
[2025-09-17 13:26:30,183] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 377 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 378 |
+
[2025-09-17 13:26:31,829] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 379 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 380 |
+
[2025-09-17 13:26:33,474] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 381 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 382 |
+
[2025-09-17 13:26:35,123] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 383 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 384 |
+
[2025-09-17 13:26:36,825] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 385 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 386 |
+
[2025-09-17 13:26:38,513] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 387 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 388 |
+
[2025-09-17 13:26:40,270] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 389 |
+
[2025-09-17 13:26:40,417] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
|
| 399 |
+
"architectures": [
|
| 400 |
+
"Qwen2ForCausalLM"
|
| 401 |
+
],
|
| 402 |
+
"attention_dropout": 0.0,
|
| 403 |
+
"bos_token_id": 151643,
|
| 404 |
+
"eos_token_id": 151645,
|
| 405 |
+
"hidden_act": "silu",
|
| 406 |
+
"hidden_size": 3584,
|
| 407 |
+
"initializer_range": 0.02,
|
| 408 |
+
"intermediate_size": 18944,
|
| 409 |
+
"layer_types": [
|
| 410 |
+
"full_attention",
|
| 411 |
+
"full_attention",
|
| 412 |
+
"full_attention",
|
| 413 |
+
"full_attention",
|
| 414 |
+
"full_attention",
|
| 415 |
+
"full_attention",
|
| 416 |
+
"full_attention",
|
| 417 |
+
"full_attention",
|
| 418 |
+
"full_attention",
|
| 419 |
+
"full_attention",
|
| 420 |
+
"full_attention",
|
| 421 |
+
"full_attention",
|
| 422 |
+
"full_attention",
|
| 423 |
+
"full_attention",
|
| 424 |
+
"full_attention",
|
| 425 |
+
"full_attention",
|
| 426 |
+
"full_attention",
|
| 427 |
+
"full_attention",
|
| 428 |
+
"full_attention",
|
| 429 |
+
"full_attention",
|
| 430 |
+
"full_attention",
|
| 431 |
+
"full_attention",
|
| 432 |
+
"full_attention",
|
| 433 |
+
"full_attention",
|
| 434 |
+
"full_attention",
|
| 435 |
+
"full_attention",
|
| 436 |
+
"full_attention",
|
| 437 |
+
"full_attention"
|
| 438 |
+
],
|
| 439 |
+
"max_position_embeddings": 32768,
|
| 440 |
+
"max_window_layers": 28,
|
| 441 |
+
"model_type": "qwen2",
|
| 442 |
+
"num_attention_heads": 28,
|
| 443 |
+
"num_hidden_layers": 28,
|
| 444 |
+
"num_key_value_heads": 4,
|
| 445 |
+
"pad_token_id": 151643,
|
| 446 |
+
"rms_norm_eps": 1e-06,
|
| 447 |
+
"rope_scaling": null,
|
| 448 |
+
"rope_theta": 1000000.0,
|
| 449 |
+
"sliding_window": null,
|
| 450 |
+
"tie_word_embeddings": false,
|
| 451 |
+
"torch_dtype": "bfloat16",
|
| 452 |
+
"transformers_version": "4.55.4",
|
| 453 |
+
"use_cache": true,
|
| 454 |
+
"use_sliding_window": false,
|
| 455 |
+
"vocab_size": 152064
|
| 456 |
+
}
|
| 457 |
+
, task_type='causal_lm', num_labels=None)
|
| 458 |
+
[INFO:swift] model.generation_config: GenerationConfig {
|
| 459 |
+
"bos_token_id": 151643,
|
| 460 |
+
"eos_token_id": [
|
| 461 |
+
151645,
|
| 462 |
+
151643
|
| 463 |
+
],
|
| 464 |
+
"max_new_tokens": 64,
|
| 465 |
+
"pad_token_id": 151643,
|
| 466 |
+
"repetition_penalty": 1.05
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
|
| 470 |
+
[INFO:swift] max_length: 16240
|
| 471 |
+
[INFO:swift] response_prefix: ''
|
| 472 |
+
[INFO:swift] agent_template: hermes
|
| 473 |
+
[INFO:swift] Start time of running main: 2025-09-17 13:26:42.816518
|
| 474 |
+
[INFO:swift] swift.__version__: 3.8.0.dev0
|
| 475 |
+
[rank1]: Traceback (most recent call last):
|
| 476 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 477 |
+
[rank1]: sft_main()
|
| 478 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 479 |
+
[rank1]: return SwiftSft(args).main()
|
| 480 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 481 |
+
[rank1]: result = self.run()
|
| 482 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 483 |
+
[rank1]: train_dataset, val_dataset = self._prepare_dataset()
|
| 484 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 485 |
+
[rank1]: train_dataset, val_dataset = self._get_dataset()
|
| 486 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 487 |
+
[rank1]: train_dataset, val_dataset = load_dataset(
|
| 488 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 489 |
+
[rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 490 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 491 |
+
[rank1]: dataset = DatasetLoader._load_repo_dataset(
|
| 492 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 493 |
+
[rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 494 |
+
[rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 495 |
+
[rank6]: Traceback (most recent call last):
|
| 496 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 497 |
+
[rank6]: sft_main()
|
| 498 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 499 |
+
[rank6]: return SwiftSft(args).main()
|
| 500 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 501 |
+
[rank6]: result = self.run()
|
| 502 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 503 |
+
[rank6]: train_dataset, val_dataset = self._prepare_dataset()
|
| 504 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 505 |
+
[rank6]: train_dataset, val_dataset = self._get_dataset()
|
| 506 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 507 |
+
[rank6]: train_dataset, val_dataset = load_dataset(
|
| 508 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 509 |
+
[rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 510 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 511 |
+
[rank6]: dataset = DatasetLoader._load_repo_dataset(
|
| 512 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 513 |
+
[rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 514 |
+
[rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 515 |
+
[rank5]: Traceback (most recent call last):
|
| 516 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 517 |
+
[rank5]: sft_main()
|
| 518 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 519 |
+
[rank5]: return SwiftSft(args).main()
|
| 520 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 521 |
+
[rank5]: result = self.run()
|
| 522 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 523 |
+
[rank5]: train_dataset, val_dataset = self._prepare_dataset()
|
| 524 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 525 |
+
[rank5]: train_dataset, val_dataset = self._get_dataset()
|
| 526 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 527 |
+
[rank5]: train_dataset, val_dataset = load_dataset(
|
| 528 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 529 |
+
[rank5]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 530 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 531 |
+
[rank5]: dataset = DatasetLoader._load_repo_dataset(
|
| 532 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 533 |
+
[rank5]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 534 |
+
[rank5]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 535 |
+
[rank4]: Traceback (most recent call last):
|
| 536 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 537 |
+
[rank4]: sft_main()
|
| 538 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 539 |
+
[rank4]: return SwiftSft(args).main()
|
| 540 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 541 |
+
[rank4]: result = self.run()
|
| 542 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 543 |
+
[rank4]: train_dataset, val_dataset = self._prepare_dataset()
|
| 544 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 545 |
+
[rank4]: train_dataset, val_dataset = self._get_dataset()
|
| 546 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 547 |
+
[rank4]: train_dataset, val_dataset = load_dataset(
|
| 548 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 549 |
+
[rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 550 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 551 |
+
[rank4]: dataset = DatasetLoader._load_repo_dataset(
|
| 552 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 553 |
+
[rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 554 |
+
[rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 555 |
+
[rank0]: Traceback (most recent call last):
|
| 556 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 557 |
+
[rank0]: sft_main()
|
| 558 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 559 |
+
[rank0]: return SwiftSft(args).main()
|
| 560 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 561 |
+
[rank0]: result = self.run()
|
| 562 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 563 |
+
[rank0]: train_dataset, val_dataset = self._prepare_dataset()
|
| 564 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 565 |
+
[rank0]: train_dataset, val_dataset = self._get_dataset()
|
| 566 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 567 |
+
[rank0]: train_dataset, val_dataset = load_dataset(
|
| 568 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 569 |
+
[rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 570 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 571 |
+
[rank0]: dataset = DatasetLoader._load_repo_dataset(
|
| 572 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 573 |
+
[rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 574 |
+
[rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 575 |
+
[rank3]: Traceback (most recent call last):
|
| 576 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 577 |
+
[rank3]: sft_main()
|
| 578 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 579 |
+
[rank3]: return SwiftSft(args).main()
|
| 580 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 581 |
+
[rank3]: result = self.run()
|
| 582 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 583 |
+
[rank3]: train_dataset, val_dataset = self._prepare_dataset()
|
| 584 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 585 |
+
[rank3]: train_dataset, val_dataset = self._get_dataset()
|
| 586 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 587 |
+
[rank3]: train_dataset, val_dataset = load_dataset(
|
| 588 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 589 |
+
[rank3]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 590 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 591 |
+
[rank3]: dataset = DatasetLoader._load_repo_dataset(
|
| 592 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 593 |
+
[rank3]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 594 |
+
[rank3]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 595 |
+
[rank7]: Traceback (most recent call last):
|
| 596 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 597 |
+
[rank7]: sft_main()
|
| 598 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 599 |
+
[rank7]: return SwiftSft(args).main()
|
| 600 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 601 |
+
[rank7]: result = self.run()
|
| 602 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 603 |
+
[rank7]: train_dataset, val_dataset = self._prepare_dataset()
|
| 604 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 605 |
+
[rank7]: train_dataset, val_dataset = self._get_dataset()
|
| 606 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 607 |
+
[rank7]: train_dataset, val_dataset = load_dataset(
|
| 608 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 609 |
+
[rank7]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 610 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 611 |
+
[rank7]: dataset = DatasetLoader._load_repo_dataset(
|
| 612 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 613 |
+
[rank7]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 614 |
+
[rank7]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 615 |
+
[rank2]: Traceback (most recent call last):
|
| 616 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 617 |
+
[rank2]: sft_main()
|
| 618 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 619 |
+
[rank2]: return SwiftSft(args).main()
|
| 620 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 621 |
+
[rank2]: result = self.run()
|
| 622 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 623 |
+
[rank2]: train_dataset, val_dataset = self._prepare_dataset()
|
| 624 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 625 |
+
[rank2]: train_dataset, val_dataset = self._get_dataset()
|
| 626 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 627 |
+
[rank2]: train_dataset, val_dataset = load_dataset(
|
| 628 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 629 |
+
[rank2]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 630 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 631 |
+
[rank2]: dataset = DatasetLoader._load_repo_dataset(
|
| 632 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 633 |
+
[rank2]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 634 |
+
[rank2]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 635 |
+
[rank0]:[W917 13:26:48.475248646 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 636 |
+
W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212634 closing signal SIGTERM
|
| 637 |
+
W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212636 closing signal SIGTERM
|
| 638 |
+
W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212637 closing signal SIGTERM
|
| 639 |
+
W0917 13:26:49.060000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212638 closing signal SIGTERM
|
| 640 |
+
W0917 13:26:49.062000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212639 closing signal SIGTERM
|
| 641 |
+
W0917 13:26:49.064000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212640 closing signal SIGTERM
|
| 642 |
+
W0917 13:26:49.065000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 212641 closing signal SIGTERM
|
| 643 |
+
E0917 13:26:50.209000 212569 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 212635) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
|
| 644 |
+
Traceback (most recent call last):
|
| 645 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
| 646 |
+
return _run_code(code, main_globals, None,
|
| 647 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
|
| 648 |
+
exec(code, run_globals)
|
| 649 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
|
| 650 |
+
main()
|
| 651 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
|
| 652 |
+
return f(*args, **kwargs)
|
| 653 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
|
| 654 |
+
run(args)
|
| 655 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
|
| 656 |
+
elastic_launch(
|
| 657 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
|
| 658 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 659 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
|
| 660 |
+
raise ChildFailedError(
|
| 661 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 662 |
+
============================================================
|
| 663 |
+
/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
|
| 664 |
+
------------------------------------------------------------
|
| 665 |
+
Failures:
|
| 666 |
+
<NO_OTHER_FAILURES>
|
| 667 |
+
------------------------------------------------------------
|
| 668 |
+
Root Cause (first observed failure):
|
| 669 |
+
[0]:
|
| 670 |
+
time : 2025-09-17_13:26:49
|
| 671 |
+
host : TENCENT64.site
|
| 672 |
+
rank : 1 (local_rank: 1)
|
| 673 |
+
exitcode : 1 (pid: 212635)
|
| 674 |
+
error_file: <N/A>
|
| 675 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 676 |
+
============================================================
|
log/20250917-13:35:09.log
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 9 |
+
[2025-09-17 13:35:23,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 10 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 11 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
[2025-09-17 13:35:24,740] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 13 |
+
[2025-09-17 13:35:24,749] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 14 |
+
[2025-09-17 13:35:25,072] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 15 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 16 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 17 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 18 |
+
[INFO:swift] Setting args.lazy_tokenize: False
|
| 19 |
+
[2025-09-17 13:35:26,421] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 20 |
+
[2025-09-17 13:35:26,430] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 21 |
+
[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
|
| 22 |
+
[2025-09-17 13:35:26,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 23 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 24 |
+
[2025-09-17 13:35:28,235] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 25 |
+
[2025-09-17 13:35:28,244] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 26 |
+
[2025-09-17 13:35:28,244] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 27 |
+
[2025-09-17 13:35:28,524] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 28 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 29 |
+
[2025-09-17 13:35:29,859] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 30 |
+
[2025-09-17 13:35:29,867] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 31 |
+
[2025-09-17 13:35:30,411] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 32 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 33 |
+
[2025-09-17 13:35:32,085] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 34 |
+
[2025-09-17 13:35:32,094] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 35 |
+
[2025-09-17 13:35:32,252] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 36 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 37 |
+
[2025-09-17 13:35:33,593] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 38 |
+
[2025-09-17 13:35:33,600] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 39 |
+
[2025-09-17 13:35:33,602] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 40 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 41 |
+
[2025-09-17 13:35:34,920] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 42 |
+
[2025-09-17 13:35:34,928] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 43 |
+
[2025-09-17 13:35:35,271] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 44 |
+
[2025-09-17 13:35:36,578] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 45 |
+
[2025-09-17 13:35:36,586] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 46 |
+
[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538
|
| 47 |
+
[INFO:swift] Global seed set to 42
|
| 48 |
+
[INFO:swift] args: TrainArguments(
|
| 49 |
+
_n_gpu=-1,
|
| 50 |
+
acc_strategy=token,
|
| 51 |
+
accelerator_config={'dispatch_batches': False},
|
| 52 |
+
adafactor=False,
|
| 53 |
+
adalora_beta1=0.85,
|
| 54 |
+
adalora_beta2=0.85,
|
| 55 |
+
adalora_deltaT=1,
|
| 56 |
+
adalora_init_r=12,
|
| 57 |
+
adalora_orth_reg_weight=0.5,
|
| 58 |
+
adalora_target_r=8,
|
| 59 |
+
adalora_tfinal=0,
|
| 60 |
+
adalora_tinit=0,
|
| 61 |
+
adam_beta1=0.9,
|
| 62 |
+
adam_beta2=0.95,
|
| 63 |
+
adam_epsilon=1e-08,
|
| 64 |
+
adapter_act=gelu,
|
| 65 |
+
adapter_length=128,
|
| 66 |
+
adapters=[],
|
| 67 |
+
add_version=True,
|
| 68 |
+
agent_template=None,
|
| 69 |
+
aligner_lr=None,
|
| 70 |
+
attn_impl=None,
|
| 71 |
+
auto_find_batch_size=False,
|
| 72 |
+
average_tokens_across_devices=True,
|
| 73 |
+
batch_eval_metrics=False,
|
| 74 |
+
bf16=True,
|
| 75 |
+
bf16_full_eval=False,
|
| 76 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 77 |
+
bnb_4bit_quant_storage=None,
|
| 78 |
+
bnb_4bit_quant_type=nf4,
|
| 79 |
+
bnb_4bit_use_double_quant=True,
|
| 80 |
+
boft_block_num=0,
|
| 81 |
+
boft_block_size=4,
|
| 82 |
+
boft_dropout=0.0,
|
| 83 |
+
boft_n_butterfly_factor=1,
|
| 84 |
+
cached_dataset=[],
|
| 85 |
+
channels=None,
|
| 86 |
+
check_model=True,
|
| 87 |
+
ckpt_dir=None,
|
| 88 |
+
columns={},
|
| 89 |
+
create_checkpoint_symlink=False,
|
| 90 |
+
custom_dataset_info=[],
|
| 91 |
+
custom_register_path=[],
|
| 92 |
+
data_seed=42,
|
| 93 |
+
dataloader_drop_last=False,
|
| 94 |
+
dataloader_num_workers=48,
|
| 95 |
+
dataloader_persistent_workers=False,
|
| 96 |
+
dataloader_pin_memory=True,
|
| 97 |
+
dataloader_prefetch_factor=None,
|
| 98 |
+
dataset=['/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
|
| 99 |
+
dataset_num_proc=100,
|
| 100 |
+
dataset_shuffle=True,
|
| 101 |
+
ddp_backend=None,
|
| 102 |
+
ddp_broadcast_buffers=None,
|
| 103 |
+
ddp_bucket_cap_mb=None,
|
| 104 |
+
ddp_find_unused_parameters=None,
|
| 105 |
+
ddp_timeout=18000000,
|
| 106 |
+
debug=None,
|
| 107 |
+
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
|
| 108 |
+
deepspeed_autotp_size=None,
|
| 109 |
+
device_map=None,
|
| 110 |
+
disable_tqdm=None,
|
| 111 |
+
do_eval=False,
|
| 112 |
+
do_predict=False,
|
| 113 |
+
do_train=False,
|
| 114 |
+
download_mode=reuse_dataset_if_exists,
|
| 115 |
+
ds3_gather_for_generation=True,
|
| 116 |
+
early_stop_interval=None,
|
| 117 |
+
enable_dft_loss=False,
|
| 118 |
+
eval_accumulation_steps=None,
|
| 119 |
+
eval_dataset=[],
|
| 120 |
+
eval_dataset_args=None,
|
| 121 |
+
eval_delay=0,
|
| 122 |
+
eval_do_concat_batches=True,
|
| 123 |
+
eval_generation_config=None,
|
| 124 |
+
eval_limit=None,
|
| 125 |
+
eval_on_start=False,
|
| 126 |
+
eval_steps=2000.0,
|
| 127 |
+
eval_strategy=epoch,
|
| 128 |
+
eval_use_evalscope=False,
|
| 129 |
+
eval_use_gather_object=False,
|
| 130 |
+
external_plugins=[],
|
| 131 |
+
extra_eval_args=None,
|
| 132 |
+
fourier_n_frequency=2000,
|
| 133 |
+
fourier_scaling=300.0,
|
| 134 |
+
fp16=False,
|
| 135 |
+
fp16_backend=auto,
|
| 136 |
+
fp16_full_eval=False,
|
| 137 |
+
fp16_opt_level=O1,
|
| 138 |
+
freeze_aligner=False,
|
| 139 |
+
freeze_llm=False,
|
| 140 |
+
freeze_parameters=[],
|
| 141 |
+
freeze_parameters_ratio=0.0,
|
| 142 |
+
freeze_parameters_regex=None,
|
| 143 |
+
freeze_vit=True,
|
| 144 |
+
fsdp=,
|
| 145 |
+
fsdp_config=None,
|
| 146 |
+
fsdp_min_num_params=0,
|
| 147 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 148 |
+
full_determinism=False,
|
| 149 |
+
galore_cos_threshold=0.4,
|
| 150 |
+
galore_gamma_proj=2,
|
| 151 |
+
galore_optim_per_parameter=False,
|
| 152 |
+
galore_proj_bits=4,
|
| 153 |
+
galore_proj_group_size=256,
|
| 154 |
+
galore_proj_quant=False,
|
| 155 |
+
galore_proj_type=std,
|
| 156 |
+
galore_quantization=False,
|
| 157 |
+
galore_queue_size=5,
|
| 158 |
+
galore_rank=128,
|
| 159 |
+
galore_scale=1.0,
|
| 160 |
+
galore_target_modules=None,
|
| 161 |
+
galore_update_proj_gap=50,
|
| 162 |
+
galore_with_embedding=False,
|
| 163 |
+
generation_config=None,
|
| 164 |
+
generation_max_length=None,
|
| 165 |
+
generation_num_beams=None,
|
| 166 |
+
gradient_accumulation_steps=4,
|
| 167 |
+
gradient_checkpointing=True,
|
| 168 |
+
gradient_checkpointing_kwargs=None,
|
| 169 |
+
greater_is_better=False,
|
| 170 |
+
group_by_length=False,
|
| 171 |
+
half_precision_backend=auto,
|
| 172 |
+
hqq_axis=None,
|
| 173 |
+
hub_always_push=False,
|
| 174 |
+
hub_model_id=None,
|
| 175 |
+
hub_private_repo=None,
|
| 176 |
+
hub_revision=None,
|
| 177 |
+
hub_strategy=every_save,
|
| 178 |
+
hub_token=<HUB_TOKEN>,
|
| 179 |
+
ignore_args_error=False,
|
| 180 |
+
ignore_data_skip=False,
|
| 181 |
+
include_for_metrics=[],
|
| 182 |
+
include_inputs_for_metrics=False,
|
| 183 |
+
include_num_input_tokens_seen=False,
|
| 184 |
+
include_tokens_per_second=False,
|
| 185 |
+
init_strategy=None,
|
| 186 |
+
init_weights=True,
|
| 187 |
+
interleave_prob=None,
|
| 188 |
+
jit_mode_eval=False,
|
| 189 |
+
label_names=None,
|
| 190 |
+
label_smoothing_factor=0.0,
|
| 191 |
+
lazy_tokenize=False,
|
| 192 |
+
learning_rate=5e-06,
|
| 193 |
+
length_column_name=length,
|
| 194 |
+
liger_kernel_config=None,
|
| 195 |
+
lisa_activated_layers=0,
|
| 196 |
+
lisa_step_interval=20,
|
| 197 |
+
llamapro_num_groups=None,
|
| 198 |
+
llamapro_num_new_blocks=4,
|
| 199 |
+
load_args=False,
|
| 200 |
+
load_best_model_at_end=False,
|
| 201 |
+
load_data_args=False,
|
| 202 |
+
load_from_cache_file=True,
|
| 203 |
+
local_rank=0,
|
| 204 |
+
local_repo_path=None,
|
| 205 |
+
log_level=passive,
|
| 206 |
+
log_level_replica=warning,
|
| 207 |
+
log_on_each_node=True,
|
| 208 |
+
logging_dir=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538/runs,
|
| 209 |
+
logging_first_step=True,
|
| 210 |
+
logging_nan_inf_filter=True,
|
| 211 |
+
logging_steps=1,
|
| 212 |
+
logging_strategy=steps,
|
| 213 |
+
logprobs=False,
|
| 214 |
+
lora_alpha=32,
|
| 215 |
+
lora_bias=none,
|
| 216 |
+
lora_dropout=0.05,
|
| 217 |
+
lora_dtype=None,
|
| 218 |
+
lora_ga_batch_size=2,
|
| 219 |
+
lora_ga_direction=ArB2r,
|
| 220 |
+
lora_ga_iters=2,
|
| 221 |
+
lora_ga_max_length=1024,
|
| 222 |
+
lora_ga_scale=stable,
|
| 223 |
+
lora_ga_stable_gamma=16,
|
| 224 |
+
lora_modules=[],
|
| 225 |
+
lora_rank=8,
|
| 226 |
+
lorap_lr_ratio=None,
|
| 227 |
+
loss_scale=default,
|
| 228 |
+
loss_type=None,
|
| 229 |
+
lr_scheduler_kwargs=None,
|
| 230 |
+
lr_scheduler_type=cosine,
|
| 231 |
+
max_epochs=None,
|
| 232 |
+
max_grad_norm=1.0,
|
| 233 |
+
max_length=16240,
|
| 234 |
+
max_memory={},
|
| 235 |
+
max_model_len=None,
|
| 236 |
+
max_new_tokens=64,
|
| 237 |
+
max_pixels=None,
|
| 238 |
+
max_steps=-1,
|
| 239 |
+
metric=None,
|
| 240 |
+
metric_for_best_model=loss,
|
| 241 |
+
model=Qwen/Qwen2.5-7B-Instruct,
|
| 242 |
+
model_author=None,
|
| 243 |
+
model_kwargs={},
|
| 244 |
+
model_name=None,
|
| 245 |
+
model_revision=None,
|
| 246 |
+
model_type=qwen2_5,
|
| 247 |
+
modules_to_save=[],
|
| 248 |
+
mp_parameters=,
|
| 249 |
+
neftune_noise_alpha=None,
|
| 250 |
+
new_special_tokens=[],
|
| 251 |
+
no_cuda=False,
|
| 252 |
+
norm_bbox=None,
|
| 253 |
+
num_beams=1,
|
| 254 |
+
num_labels=None,
|
| 255 |
+
num_train_epochs=2.0,
|
| 256 |
+
optim=adamw_torch_fused,
|
| 257 |
+
optim_args=None,
|
| 258 |
+
optim_target_modules=None,
|
| 259 |
+
optimizer=None,
|
| 260 |
+
output_dir=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538,
|
| 261 |
+
overwrite_output_dir=False,
|
| 262 |
+
packing=False,
|
| 263 |
+
packing_length=None,
|
| 264 |
+
padding_free=False,
|
| 265 |
+
padding_side=right,
|
| 266 |
+
past_index=-1,
|
| 267 |
+
per_device_eval_batch_size=1,
|
| 268 |
+
per_device_train_batch_size=2,
|
| 269 |
+
predict_with_generate=False,
|
| 270 |
+
prediction_loss_only=False,
|
| 271 |
+
problem_type=None,
|
| 272 |
+
push_to_hub=False,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 276 |
+
quant_bits=None,
|
| 277 |
+
quant_method=None,
|
| 278 |
+
ray_scope=last,
|
| 279 |
+
reft_args=None,
|
| 280 |
+
reft_intervention_type=LoreftIntervention,
|
| 281 |
+
reft_layer_key=None,
|
| 282 |
+
reft_layers=None,
|
| 283 |
+
reft_rank=4,
|
| 284 |
+
remove_unused_columns=True,
|
| 285 |
+
repetition_penalty=None,
|
| 286 |
+
report_to=['tensorboard'],
|
| 287 |
+
response_prefix=None,
|
| 288 |
+
restore_callback_states_from_checkpoint=False,
|
| 289 |
+
resume_from_checkpoint=None,
|
| 290 |
+
resume_only_model=False,
|
| 291 |
+
rope_scaling=None,
|
| 292 |
+
router_aux_loss_coef=0.0,
|
| 293 |
+
run_name=/group/40143/hongzhuyi/ms-swift/output/v2-20250917-133538,
|
| 294 |
+
save_on_each_node=False,
|
| 295 |
+
save_only_model=False,
|
| 296 |
+
save_safetensors=True,
|
| 297 |
+
save_steps=500,
|
| 298 |
+
save_strategy=epoch,
|
| 299 |
+
save_total_limit=None,
|
| 300 |
+
seed=42,
|
| 301 |
+
sequence_parallel_size=1,
|
| 302 |
+
shuffle_buffer_size=1000,
|
| 303 |
+
skip_memory_metrics=True,
|
| 304 |
+
sortish_sampler=False,
|
| 305 |
+
split_dataset_ratio=0.001,
|
| 306 |
+
stop_words=[],
|
| 307 |
+
stopping_strategy=first_exhausted,
|
| 308 |
+
stream=False,
|
| 309 |
+
streaming=False,
|
| 310 |
+
strict=False,
|
| 311 |
+
swanlab_exp_name=None,
|
| 312 |
+
swanlab_lark_secret=None,
|
| 313 |
+
swanlab_lark_webhook_url=None,
|
| 314 |
+
swanlab_mode=cloud,
|
| 315 |
+
swanlab_project=None,
|
| 316 |
+
swanlab_token=<SWANLAB_TOKEN>,
|
| 317 |
+
swanlab_workspace=None,
|
| 318 |
+
system=None,
|
| 319 |
+
target_modules=['all-linear'],
|
| 320 |
+
target_regex=None,
|
| 321 |
+
task_type=causal_lm,
|
| 322 |
+
temperature=0.0,
|
| 323 |
+
template=qwen2_5,
|
| 324 |
+
template_backend=swift,
|
| 325 |
+
tf32=None,
|
| 326 |
+
top_k=None,
|
| 327 |
+
top_logprobs=None,
|
| 328 |
+
top_p=None,
|
| 329 |
+
torch_compile=False,
|
| 330 |
+
torch_compile_backend=None,
|
| 331 |
+
torch_compile_mode=None,
|
| 332 |
+
torch_dtype=torch.bfloat16,
|
| 333 |
+
torch_empty_cache_steps=None,
|
| 334 |
+
torchdynamo=None,
|
| 335 |
+
tpu_metrics_debug=False,
|
| 336 |
+
tpu_num_cores=None,
|
| 337 |
+
train_dataloader_shuffle=True,
|
| 338 |
+
train_type=full,
|
| 339 |
+
trainable_parameters=[],
|
| 340 |
+
trainable_parameters_regex=None,
|
| 341 |
+
truncation_strategy=delete,
|
| 342 |
+
tuner_backend=peft,
|
| 343 |
+
use_chat_template=True,
|
| 344 |
+
use_cpu=False,
|
| 345 |
+
use_dora=False,
|
| 346 |
+
use_flash_ckpt=False,
|
| 347 |
+
use_galore=False,
|
| 348 |
+
use_hf=False,
|
| 349 |
+
use_ipex=False,
|
| 350 |
+
use_legacy_prediction_loop=False,
|
| 351 |
+
use_liger_kernel=False,
|
| 352 |
+
use_logits_to_keep=None,
|
| 353 |
+
use_mps_device=False,
|
| 354 |
+
use_rslora=False,
|
| 355 |
+
use_swift_lora=False,
|
| 356 |
+
val_dataset=[],
|
| 357 |
+
val_dataset_shuffle=False,
|
| 358 |
+
vera_d_initial=0.1,
|
| 359 |
+
vera_dropout=0.0,
|
| 360 |
+
vera_projection_prng_key=0,
|
| 361 |
+
vera_rank=256,
|
| 362 |
+
vit_gradient_checkpointing=None,
|
| 363 |
+
vit_lr=None,
|
| 364 |
+
warmup_ratio=0.05,
|
| 365 |
+
warmup_steps=0,
|
| 366 |
+
weight_decay=0.1,
|
| 367 |
+
zero_hpz_partition_size=None,
|
| 368 |
+
)
|
| 369 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 370 |
+
[2025-09-17 13:35:42,156] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 371 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 372 |
+
[2025-09-17 13:35:43,747] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 373 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 374 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 375 |
+
[2025-09-17 13:35:45,419] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 376 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 377 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 378 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 379 |
+
[INFO:swift] model_kwargs: {'device_map': None}
|
| 380 |
+
[2025-09-17 13:35:47,059] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 381 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 382 |
+
[2025-09-17 13:35:48,636] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 383 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 384 |
+
[2025-09-17 13:35:50,272] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 385 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 386 |
+
[2025-09-17 13:35:51,903] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 387 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 388 |
+
[2025-09-17 13:35:53,570] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 389 |
+
[2025-09-17 13:35:53,719] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
[rank3]: Traceback (most recent call last):
|
| 398 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 399 |
+
[rank3]: sft_main()
|
| 400 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 401 |
+
[rank3]: return SwiftSft(args).main()
|
| 402 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 403 |
+
[rank3]: result = self.run()
|
| 404 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 405 |
+
[rank3]: train_dataset, val_dataset = self._prepare_dataset()
|
| 406 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 407 |
+
[rank3]: train_dataset, val_dataset = self._get_dataset()
|
| 408 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 409 |
+
[rank3]: train_dataset, val_dataset = load_dataset(
|
| 410 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 411 |
+
[rank3]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 412 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 413 |
+
[rank3]: dataset = DatasetLoader._load_repo_dataset(
|
| 414 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 415 |
+
[rank3]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 416 |
+
[rank3]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 417 |
+
[rank5]: Traceback (most recent call last):
|
| 418 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 419 |
+
[rank5]: sft_main()
|
| 420 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 421 |
+
[rank5]: return SwiftSft(args).main()
|
| 422 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 423 |
+
[rank5]: result = self.run()
|
| 424 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 425 |
+
[rank5]: train_dataset, val_dataset = self._prepare_dataset()
|
| 426 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 427 |
+
[rank5]: train_dataset, val_dataset = self._get_dataset()
|
| 428 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 429 |
+
[rank5]: train_dataset, val_dataset = load_dataset(
|
| 430 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 431 |
+
[rank5]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 432 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 433 |
+
[rank5]: dataset = DatasetLoader._load_repo_dataset(
|
| 434 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 435 |
+
[rank5]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 436 |
+
[rank5]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 437 |
+
[rank2]: Traceback (most recent call last):
|
| 438 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 439 |
+
[rank2]: sft_main()
|
| 440 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 441 |
+
[rank2]: return SwiftSft(args).main()
|
| 442 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 443 |
+
[rank2]: result = self.run()
|
| 444 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 445 |
+
[rank2]: train_dataset, val_dataset = self._prepare_dataset()
|
| 446 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 447 |
+
[rank2]: train_dataset, val_dataset = self._get_dataset()
|
| 448 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 449 |
+
[rank2]: train_dataset, val_dataset = load_dataset(
|
| 450 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 451 |
+
[rank2]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 452 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 453 |
+
[rank2]: dataset = DatasetLoader._load_repo_dataset(
|
| 454 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 455 |
+
[rank2]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 456 |
+
[rank2]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 457 |
+
[rank7]: Traceback (most recent call last):
|
| 458 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 459 |
+
[rank7]: sft_main()
|
| 460 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 461 |
+
[rank7]: return SwiftSft(args).main()
|
| 462 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 463 |
+
[rank7]: result = self.run()
|
| 464 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 465 |
+
[rank7]: train_dataset, val_dataset = self._prepare_dataset()
|
| 466 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 467 |
+
[rank7]: train_dataset, val_dataset = self._get_dataset()
|
| 468 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 469 |
+
[rank7]: train_dataset, val_dataset = load_dataset(
|
| 470 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 471 |
+
[rank7]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 472 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 473 |
+
[rank7]: dataset = DatasetLoader._load_repo_dataset(
|
| 474 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 475 |
+
[rank7]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 476 |
+
[rank7]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 477 |
+
[rank4]: Traceback (most recent call last):
|
| 478 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 479 |
+
[rank4]: sft_main()
|
| 480 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 481 |
+
[rank4]: return SwiftSft(args).main()
|
| 482 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 483 |
+
[rank4]: result = self.run()
|
| 484 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 485 |
+
[rank4]: train_dataset, val_dataset = self._prepare_dataset()
|
| 486 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 487 |
+
[rank4]: train_dataset, val_dataset = self._get_dataset()
|
| 488 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 489 |
+
[rank4]: train_dataset, val_dataset = load_dataset(
|
| 490 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 491 |
+
[rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 492 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 493 |
+
[rank4]: dataset = DatasetLoader._load_repo_dataset(
|
| 494 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 495 |
+
[rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 496 |
+
[rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 497 |
+
[rank1]: Traceback (most recent call last):
|
| 498 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 499 |
+
[rank1]: sft_main()
|
| 500 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 501 |
+
[rank1]: return SwiftSft(args).main()
|
| 502 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 503 |
+
[rank1]: result = self.run()
|
| 504 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 505 |
+
[rank1]: train_dataset, val_dataset = self._prepare_dataset()
|
| 506 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 507 |
+
[rank1]: train_dataset, val_dataset = self._get_dataset()
|
| 508 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 509 |
+
[rank1]: train_dataset, val_dataset = load_dataset(
|
| 510 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 511 |
+
[rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 512 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 513 |
+
[rank1]: dataset = DatasetLoader._load_repo_dataset(
|
| 514 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 515 |
+
[rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 516 |
+
[rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 517 |
+
[rank6]: Traceback (most recent call last):
|
| 518 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 519 |
+
[rank6]: sft_main()
|
| 520 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 521 |
+
[rank6]: return SwiftSft(args).main()
|
| 522 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 523 |
+
[rank6]: result = self.run()
|
| 524 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 525 |
+
[rank6]: train_dataset, val_dataset = self._prepare_dataset()
|
| 526 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 527 |
+
[rank6]: train_dataset, val_dataset = self._get_dataset()
|
| 528 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 529 |
+
[rank6]: train_dataset, val_dataset = load_dataset(
|
| 530 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 531 |
+
[rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 532 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 533 |
+
[rank6]: dataset = DatasetLoader._load_repo_dataset(
|
| 534 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 535 |
+
[rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 536 |
+
[rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 537 |
+
|
| 538 |
+
[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
|
| 539 |
+
"architectures": [
|
| 540 |
+
"Qwen2ForCausalLM"
|
| 541 |
+
],
|
| 542 |
+
"attention_dropout": 0.0,
|
| 543 |
+
"bos_token_id": 151643,
|
| 544 |
+
"eos_token_id": 151645,
|
| 545 |
+
"hidden_act": "silu",
|
| 546 |
+
"hidden_size": 3584,
|
| 547 |
+
"initializer_range": 0.02,
|
| 548 |
+
"intermediate_size": 18944,
|
| 549 |
+
"layer_types": [
|
| 550 |
+
"full_attention",
|
| 551 |
+
"full_attention",
|
| 552 |
+
"full_attention",
|
| 553 |
+
"full_attention",
|
| 554 |
+
"full_attention",
|
| 555 |
+
"full_attention",
|
| 556 |
+
"full_attention",
|
| 557 |
+
"full_attention",
|
| 558 |
+
"full_attention",
|
| 559 |
+
"full_attention",
|
| 560 |
+
"full_attention",
|
| 561 |
+
"full_attention",
|
| 562 |
+
"full_attention",
|
| 563 |
+
"full_attention",
|
| 564 |
+
"full_attention",
|
| 565 |
+
"full_attention",
|
| 566 |
+
"full_attention",
|
| 567 |
+
"full_attention",
|
| 568 |
+
"full_attention",
|
| 569 |
+
"full_attention",
|
| 570 |
+
"full_attention",
|
| 571 |
+
"full_attention",
|
| 572 |
+
"full_attention",
|
| 573 |
+
"full_attention",
|
| 574 |
+
"full_attention",
|
| 575 |
+
"full_attention",
|
| 576 |
+
"full_attention",
|
| 577 |
+
"full_attention"
|
| 578 |
+
],
|
| 579 |
+
"max_position_embeddings": 32768,
|
| 580 |
+
"max_window_layers": 28,
|
| 581 |
+
"model_type": "qwen2",
|
| 582 |
+
"num_attention_heads": 28,
|
| 583 |
+
"num_hidden_layers": 28,
|
| 584 |
+
"num_key_value_heads": 4,
|
| 585 |
+
"pad_token_id": 151643,
|
| 586 |
+
"rms_norm_eps": 1e-06,
|
| 587 |
+
"rope_scaling": null,
|
| 588 |
+
"rope_theta": 1000000.0,
|
| 589 |
+
"sliding_window": null,
|
| 590 |
+
"tie_word_embeddings": false,
|
| 591 |
+
"torch_dtype": "bfloat16",
|
| 592 |
+
"transformers_version": "4.55.4",
|
| 593 |
+
"use_cache": true,
|
| 594 |
+
"use_sliding_window": false,
|
| 595 |
+
"vocab_size": 152064
|
| 596 |
+
}
|
| 597 |
+
, task_type='causal_lm', num_labels=None)
|
| 598 |
+
[INFO:swift] model.generation_config: GenerationConfig {
|
| 599 |
+
"bos_token_id": 151643,
|
| 600 |
+
"eos_token_id": [
|
| 601 |
+
151645,
|
| 602 |
+
151643
|
| 603 |
+
],
|
| 604 |
+
"max_new_tokens": 64,
|
| 605 |
+
"pad_token_id": 151643,
|
| 606 |
+
"repetition_penalty": 1.05
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
|
| 610 |
+
[INFO:swift] max_length: 16240
|
| 611 |
+
[INFO:swift] response_prefix: ''
|
| 612 |
+
[INFO:swift] agent_template: hermes
|
| 613 |
+
[INFO:swift] Start time of running main: 2025-09-17 13:35:55.854352
|
| 614 |
+
[INFO:swift] swift.__version__: 3.8.0.dev0
|
| 615 |
+
[rank0]: Traceback (most recent call last):
|
| 616 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 617 |
+
[rank0]: sft_main()
|
| 618 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 619 |
+
[rank0]: return SwiftSft(args).main()
|
| 620 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 621 |
+
[rank0]: result = self.run()
|
| 622 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 623 |
+
[rank0]: train_dataset, val_dataset = self._prepare_dataset()
|
| 624 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 625 |
+
[rank0]: train_dataset, val_dataset = self._get_dataset()
|
| 626 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 627 |
+
[rank0]: train_dataset, val_dataset = load_dataset(
|
| 628 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 629 |
+
[rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 630 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 631 |
+
[rank0]: dataset = DatasetLoader._load_repo_dataset(
|
| 632 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 633 |
+
[rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 634 |
+
[rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_2083q_0.8_swift.jsonl`. os.path.exists(dataset_id): False
|
| 635 |
+
[rank0]:[W917 13:35:56.111200305 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 636 |
+
W0917 13:35:57.757000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214064 closing signal SIGTERM
|
| 637 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214065 closing signal SIGTERM
|
| 638 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214067 closing signal SIGTERM
|
| 639 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214068 closing signal SIGTERM
|
| 640 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214069 closing signal SIGTERM
|
| 641 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214070 closing signal SIGTERM
|
| 642 |
+
W0917 13:35:57.758000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 214071 closing signal SIGTERM
|
| 643 |
+
E0917 13:35:58.956000 213999 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 214066) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
|
| 644 |
+
Traceback (most recent call last):
|
| 645 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
| 646 |
+
return _run_code(code, main_globals, None,
|
| 647 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
|
| 648 |
+
exec(code, run_globals)
|
| 649 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
|
| 650 |
+
main()
|
| 651 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
|
| 652 |
+
return f(*args, **kwargs)
|
| 653 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
|
| 654 |
+
run(args)
|
| 655 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
|
| 656 |
+
elastic_launch(
|
| 657 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
|
| 658 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 659 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
|
| 660 |
+
raise ChildFailedError(
|
| 661 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 662 |
+
============================================================
|
| 663 |
+
/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
|
| 664 |
+
------------------------------------------------------------
|
| 665 |
+
Failures:
|
| 666 |
+
<NO_OTHER_FAILURES>
|
| 667 |
+
------------------------------------------------------------
|
| 668 |
+
Root Cause (first observed failure):
|
| 669 |
+
[0]:
|
| 670 |
+
time : 2025-09-17_13:35:57
|
| 671 |
+
host : TENCENT64.site
|
| 672 |
+
rank : 2 (local_rank: 2)
|
| 673 |
+
exitcode : 1 (pid: 214066)
|
| 674 |
+
error_file: <N/A>
|
| 675 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 676 |
+
============================================================
|
log/20250917-13:41:16.log
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 9 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 10 |
+
[2025-09-17 13:41:29,202] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 11 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 13 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 14 |
+
[INFO:swift] Setting args.lazy_tokenize: False
|
| 15 |
+
[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
|
| 16 |
+
[2025-09-17 13:41:30,559] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 17 |
+
[2025-09-17 13:41:30,567] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 18 |
+
[2025-09-17 13:41:30,594] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 19 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 20 |
+
[2025-09-17 13:41:31,893] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 21 |
+
[2025-09-17 13:41:31,902] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 22 |
+
[2025-09-17 13:41:31,902] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 23 |
+
[2025-09-17 13:41:32,524] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 24 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 25 |
+
[2025-09-17 13:41:34,115] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 26 |
+
[2025-09-17 13:41:34,123] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 27 |
+
[2025-09-17 13:41:34,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 28 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 29 |
+
[2025-09-17 13:41:35,568] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 30 |
+
[2025-09-17 13:41:35,577] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 31 |
+
[2025-09-17 13:41:35,847] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 32 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 33 |
+
[2025-09-17 13:41:37,286] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 34 |
+
[2025-09-17 13:41:37,294] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 35 |
+
[2025-09-17 13:41:37,640] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 36 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 37 |
+
[2025-09-17 13:41:38,974] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 38 |
+
[2025-09-17 13:41:38,986] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 39 |
+
[2025-09-17 13:41:38,995] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 40 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 41 |
+
[2025-09-17 13:41:40,341] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 42 |
+
[2025-09-17 13:41:40,350] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 43 |
+
[2025-09-17 13:41:40,690] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 44 |
+
[2025-09-17 13:41:41,989] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 45 |
+
[2025-09-17 13:41:41,999] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 46 |
+
[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144
|
| 47 |
+
[INFO:swift] Global seed set to 42
|
| 48 |
+
[INFO:swift] args: TrainArguments(
|
| 49 |
+
_n_gpu=-1,
|
| 50 |
+
acc_strategy=token,
|
| 51 |
+
accelerator_config={'dispatch_batches': False},
|
| 52 |
+
adafactor=False,
|
| 53 |
+
adalora_beta1=0.85,
|
| 54 |
+
adalora_beta2=0.85,
|
| 55 |
+
adalora_deltaT=1,
|
| 56 |
+
adalora_init_r=12,
|
| 57 |
+
adalora_orth_reg_weight=0.5,
|
| 58 |
+
adalora_target_r=8,
|
| 59 |
+
adalora_tfinal=0,
|
| 60 |
+
adalora_tinit=0,
|
| 61 |
+
adam_beta1=0.9,
|
| 62 |
+
adam_beta2=0.95,
|
| 63 |
+
adam_epsilon=1e-08,
|
| 64 |
+
adapter_act=gelu,
|
| 65 |
+
adapter_length=128,
|
| 66 |
+
adapters=[],
|
| 67 |
+
add_version=True,
|
| 68 |
+
agent_template=None,
|
| 69 |
+
aligner_lr=None,
|
| 70 |
+
attn_impl=None,
|
| 71 |
+
auto_find_batch_size=False,
|
| 72 |
+
average_tokens_across_devices=True,
|
| 73 |
+
batch_eval_metrics=False,
|
| 74 |
+
bf16=True,
|
| 75 |
+
bf16_full_eval=False,
|
| 76 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 77 |
+
bnb_4bit_quant_storage=None,
|
| 78 |
+
bnb_4bit_quant_type=nf4,
|
| 79 |
+
bnb_4bit_use_double_quant=True,
|
| 80 |
+
boft_block_num=0,
|
| 81 |
+
boft_block_size=4,
|
| 82 |
+
boft_dropout=0.0,
|
| 83 |
+
boft_n_butterfly_factor=1,
|
| 84 |
+
cached_dataset=[],
|
| 85 |
+
channels=None,
|
| 86 |
+
check_model=True,
|
| 87 |
+
ckpt_dir=None,
|
| 88 |
+
columns={},
|
| 89 |
+
create_checkpoint_symlink=False,
|
| 90 |
+
custom_dataset_info=[],
|
| 91 |
+
custom_register_path=[],
|
| 92 |
+
data_seed=42,
|
| 93 |
+
dataloader_drop_last=False,
|
| 94 |
+
dataloader_num_workers=48,
|
| 95 |
+
dataloader_persistent_workers=False,
|
| 96 |
+
dataloader_pin_memory=True,
|
| 97 |
+
dataloader_prefetch_factor=None,
|
| 98 |
+
dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
|
| 99 |
+
dataset_num_proc=100,
|
| 100 |
+
dataset_shuffle=True,
|
| 101 |
+
ddp_backend=None,
|
| 102 |
+
ddp_broadcast_buffers=None,
|
| 103 |
+
ddp_bucket_cap_mb=None,
|
| 104 |
+
ddp_find_unused_parameters=None,
|
| 105 |
+
ddp_timeout=18000000,
|
| 106 |
+
debug=None,
|
| 107 |
+
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
|
| 108 |
+
deepspeed_autotp_size=None,
|
| 109 |
+
device_map=None,
|
| 110 |
+
disable_tqdm=None,
|
| 111 |
+
do_eval=False,
|
| 112 |
+
do_predict=False,
|
| 113 |
+
do_train=False,
|
| 114 |
+
download_mode=reuse_dataset_if_exists,
|
| 115 |
+
ds3_gather_for_generation=True,
|
| 116 |
+
early_stop_interval=None,
|
| 117 |
+
enable_dft_loss=False,
|
| 118 |
+
eval_accumulation_steps=None,
|
| 119 |
+
eval_dataset=[],
|
| 120 |
+
eval_dataset_args=None,
|
| 121 |
+
eval_delay=0,
|
| 122 |
+
eval_do_concat_batches=True,
|
| 123 |
+
eval_generation_config=None,
|
| 124 |
+
eval_limit=None,
|
| 125 |
+
eval_on_start=False,
|
| 126 |
+
eval_steps=2000.0,
|
| 127 |
+
eval_strategy=epoch,
|
| 128 |
+
eval_use_evalscope=False,
|
| 129 |
+
eval_use_gather_object=False,
|
| 130 |
+
external_plugins=[],
|
| 131 |
+
extra_eval_args=None,
|
| 132 |
+
fourier_n_frequency=2000,
|
| 133 |
+
fourier_scaling=300.0,
|
| 134 |
+
fp16=False,
|
| 135 |
+
fp16_backend=auto,
|
| 136 |
+
fp16_full_eval=False,
|
| 137 |
+
fp16_opt_level=O1,
|
| 138 |
+
freeze_aligner=False,
|
| 139 |
+
freeze_llm=False,
|
| 140 |
+
freeze_parameters=[],
|
| 141 |
+
freeze_parameters_ratio=0.0,
|
| 142 |
+
freeze_parameters_regex=None,
|
| 143 |
+
freeze_vit=True,
|
| 144 |
+
fsdp=,
|
| 145 |
+
fsdp_config=None,
|
| 146 |
+
fsdp_min_num_params=0,
|
| 147 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 148 |
+
full_determinism=False,
|
| 149 |
+
galore_cos_threshold=0.4,
|
| 150 |
+
galore_gamma_proj=2,
|
| 151 |
+
galore_optim_per_parameter=False,
|
| 152 |
+
galore_proj_bits=4,
|
| 153 |
+
galore_proj_group_size=256,
|
| 154 |
+
galore_proj_quant=False,
|
| 155 |
+
galore_proj_type=std,
|
| 156 |
+
galore_quantization=False,
|
| 157 |
+
galore_queue_size=5,
|
| 158 |
+
galore_rank=128,
|
| 159 |
+
galore_scale=1.0,
|
| 160 |
+
galore_target_modules=None,
|
| 161 |
+
galore_update_proj_gap=50,
|
| 162 |
+
galore_with_embedding=False,
|
| 163 |
+
generation_config=None,
|
| 164 |
+
generation_max_length=None,
|
| 165 |
+
generation_num_beams=None,
|
| 166 |
+
gradient_accumulation_steps=4,
|
| 167 |
+
gradient_checkpointing=True,
|
| 168 |
+
gradient_checkpointing_kwargs=None,
|
| 169 |
+
greater_is_better=False,
|
| 170 |
+
group_by_length=False,
|
| 171 |
+
half_precision_backend=auto,
|
| 172 |
+
hqq_axis=None,
|
| 173 |
+
hub_always_push=False,
|
| 174 |
+
hub_model_id=None,
|
| 175 |
+
hub_private_repo=None,
|
| 176 |
+
hub_revision=None,
|
| 177 |
+
hub_strategy=every_save,
|
| 178 |
+
hub_token=<HUB_TOKEN>,
|
| 179 |
+
ignore_args_error=False,
|
| 180 |
+
ignore_data_skip=False,
|
| 181 |
+
include_for_metrics=[],
|
| 182 |
+
include_inputs_for_metrics=False,
|
| 183 |
+
include_num_input_tokens_seen=False,
|
| 184 |
+
include_tokens_per_second=False,
|
| 185 |
+
init_strategy=None,
|
| 186 |
+
init_weights=True,
|
| 187 |
+
interleave_prob=None,
|
| 188 |
+
jit_mode_eval=False,
|
| 189 |
+
label_names=None,
|
| 190 |
+
label_smoothing_factor=0.0,
|
| 191 |
+
lazy_tokenize=False,
|
| 192 |
+
learning_rate=5e-06,
|
| 193 |
+
length_column_name=length,
|
| 194 |
+
liger_kernel_config=None,
|
| 195 |
+
lisa_activated_layers=0,
|
| 196 |
+
lisa_step_interval=20,
|
| 197 |
+
llamapro_num_groups=None,
|
| 198 |
+
llamapro_num_new_blocks=4,
|
| 199 |
+
load_args=False,
|
| 200 |
+
load_best_model_at_end=False,
|
| 201 |
+
load_data_args=False,
|
| 202 |
+
load_from_cache_file=True,
|
| 203 |
+
local_rank=0,
|
| 204 |
+
local_repo_path=None,
|
| 205 |
+
log_level=passive,
|
| 206 |
+
log_level_replica=warning,
|
| 207 |
+
log_on_each_node=True,
|
| 208 |
+
logging_dir=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144/runs,
|
| 209 |
+
logging_first_step=True,
|
| 210 |
+
logging_nan_inf_filter=True,
|
| 211 |
+
logging_steps=1,
|
| 212 |
+
logging_strategy=steps,
|
| 213 |
+
logprobs=False,
|
| 214 |
+
lora_alpha=32,
|
| 215 |
+
lora_bias=none,
|
| 216 |
+
lora_dropout=0.05,
|
| 217 |
+
lora_dtype=None,
|
| 218 |
+
lora_ga_batch_size=2,
|
| 219 |
+
lora_ga_direction=ArB2r,
|
| 220 |
+
lora_ga_iters=2,
|
| 221 |
+
lora_ga_max_length=1024,
|
| 222 |
+
lora_ga_scale=stable,
|
| 223 |
+
lora_ga_stable_gamma=16,
|
| 224 |
+
lora_modules=[],
|
| 225 |
+
lora_rank=8,
|
| 226 |
+
lorap_lr_ratio=None,
|
| 227 |
+
loss_scale=default,
|
| 228 |
+
loss_type=None,
|
| 229 |
+
lr_scheduler_kwargs=None,
|
| 230 |
+
lr_scheduler_type=cosine,
|
| 231 |
+
max_epochs=None,
|
| 232 |
+
max_grad_norm=1.0,
|
| 233 |
+
max_length=16240,
|
| 234 |
+
max_memory={},
|
| 235 |
+
max_model_len=None,
|
| 236 |
+
max_new_tokens=64,
|
| 237 |
+
max_pixels=None,
|
| 238 |
+
max_steps=-1,
|
| 239 |
+
metric=None,
|
| 240 |
+
metric_for_best_model=loss,
|
| 241 |
+
model=Qwen/Qwen2.5-7B-Instruct,
|
| 242 |
+
model_author=None,
|
| 243 |
+
model_kwargs={},
|
| 244 |
+
model_name=None,
|
| 245 |
+
model_revision=None,
|
| 246 |
+
model_type=qwen2_5,
|
| 247 |
+
modules_to_save=[],
|
| 248 |
+
mp_parameters=,
|
| 249 |
+
neftune_noise_alpha=None,
|
| 250 |
+
new_special_tokens=[],
|
| 251 |
+
no_cuda=False,
|
| 252 |
+
norm_bbox=None,
|
| 253 |
+
num_beams=1,
|
| 254 |
+
num_labels=None,
|
| 255 |
+
num_train_epochs=2.0,
|
| 256 |
+
optim=adamw_torch_fused,
|
| 257 |
+
optim_args=None,
|
| 258 |
+
optim_target_modules=None,
|
| 259 |
+
optimizer=None,
|
| 260 |
+
output_dir=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144,
|
| 261 |
+
overwrite_output_dir=False,
|
| 262 |
+
packing=False,
|
| 263 |
+
packing_length=None,
|
| 264 |
+
padding_free=False,
|
| 265 |
+
padding_side=right,
|
| 266 |
+
past_index=-1,
|
| 267 |
+
per_device_eval_batch_size=1,
|
| 268 |
+
per_device_train_batch_size=2,
|
| 269 |
+
predict_with_generate=False,
|
| 270 |
+
prediction_loss_only=False,
|
| 271 |
+
problem_type=None,
|
| 272 |
+
push_to_hub=False,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 276 |
+
quant_bits=None,
|
| 277 |
+
quant_method=None,
|
| 278 |
+
ray_scope=last,
|
| 279 |
+
reft_args=None,
|
| 280 |
+
reft_intervention_type=LoreftIntervention,
|
| 281 |
+
reft_layer_key=None,
|
| 282 |
+
reft_layers=None,
|
| 283 |
+
reft_rank=4,
|
| 284 |
+
remove_unused_columns=True,
|
| 285 |
+
repetition_penalty=None,
|
| 286 |
+
report_to=['tensorboard'],
|
| 287 |
+
response_prefix=None,
|
| 288 |
+
restore_callback_states_from_checkpoint=False,
|
| 289 |
+
resume_from_checkpoint=None,
|
| 290 |
+
resume_only_model=False,
|
| 291 |
+
rope_scaling=None,
|
| 292 |
+
router_aux_loss_coef=0.0,
|
| 293 |
+
run_name=/group/40143/hongzhuyi/ms-swift/output/v3-20250917-134144,
|
| 294 |
+
save_on_each_node=False,
|
| 295 |
+
save_only_model=False,
|
| 296 |
+
save_safetensors=True,
|
| 297 |
+
save_steps=500,
|
| 298 |
+
save_strategy=epoch,
|
| 299 |
+
save_total_limit=None,
|
| 300 |
+
seed=42,
|
| 301 |
+
sequence_parallel_size=1,
|
| 302 |
+
shuffle_buffer_size=1000,
|
| 303 |
+
skip_memory_metrics=True,
|
| 304 |
+
sortish_sampler=False,
|
| 305 |
+
split_dataset_ratio=0.001,
|
| 306 |
+
stop_words=[],
|
| 307 |
+
stopping_strategy=first_exhausted,
|
| 308 |
+
stream=False,
|
| 309 |
+
streaming=False,
|
| 310 |
+
strict=False,
|
| 311 |
+
swanlab_exp_name=None,
|
| 312 |
+
swanlab_lark_secret=None,
|
| 313 |
+
swanlab_lark_webhook_url=None,
|
| 314 |
+
swanlab_mode=cloud,
|
| 315 |
+
swanlab_project=None,
|
| 316 |
+
swanlab_token=<SWANLAB_TOKEN>,
|
| 317 |
+
swanlab_workspace=None,
|
| 318 |
+
system=None,
|
| 319 |
+
target_modules=['all-linear'],
|
| 320 |
+
target_regex=None,
|
| 321 |
+
task_type=causal_lm,
|
| 322 |
+
temperature=0.0,
|
| 323 |
+
template=qwen2_5,
|
| 324 |
+
template_backend=swift,
|
| 325 |
+
tf32=None,
|
| 326 |
+
top_k=None,
|
| 327 |
+
top_logprobs=None,
|
| 328 |
+
top_p=None,
|
| 329 |
+
torch_compile=False,
|
| 330 |
+
torch_compile_backend=None,
|
| 331 |
+
torch_compile_mode=None,
|
| 332 |
+
torch_dtype=torch.bfloat16,
|
| 333 |
+
torch_empty_cache_steps=None,
|
| 334 |
+
torchdynamo=None,
|
| 335 |
+
tpu_metrics_debug=False,
|
| 336 |
+
tpu_num_cores=None,
|
| 337 |
+
train_dataloader_shuffle=True,
|
| 338 |
+
train_type=full,
|
| 339 |
+
trainable_parameters=[],
|
| 340 |
+
trainable_parameters_regex=None,
|
| 341 |
+
truncation_strategy=delete,
|
| 342 |
+
tuner_backend=peft,
|
| 343 |
+
use_chat_template=True,
|
| 344 |
+
use_cpu=False,
|
| 345 |
+
use_dora=False,
|
| 346 |
+
use_flash_ckpt=False,
|
| 347 |
+
use_galore=False,
|
| 348 |
+
use_hf=False,
|
| 349 |
+
use_ipex=False,
|
| 350 |
+
use_legacy_prediction_loop=False,
|
| 351 |
+
use_liger_kernel=False,
|
| 352 |
+
use_logits_to_keep=None,
|
| 353 |
+
use_mps_device=False,
|
| 354 |
+
use_rslora=False,
|
| 355 |
+
use_swift_lora=False,
|
| 356 |
+
val_dataset=[],
|
| 357 |
+
val_dataset_shuffle=False,
|
| 358 |
+
vera_d_initial=0.1,
|
| 359 |
+
vera_dropout=0.0,
|
| 360 |
+
vera_projection_prng_key=0,
|
| 361 |
+
vera_rank=256,
|
| 362 |
+
vit_gradient_checkpointing=None,
|
| 363 |
+
vit_lr=None,
|
| 364 |
+
warmup_ratio=0.05,
|
| 365 |
+
warmup_steps=0,
|
| 366 |
+
weight_decay=0.1,
|
| 367 |
+
zero_hpz_partition_size=None,
|
| 368 |
+
)
|
| 369 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 370 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 371 |
+
[2025-09-17 13:41:47,308] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 372 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 373 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 374 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 375 |
+
[INFO:swift] model_kwargs: {'device_map': None}
|
| 376 |
+
[2025-09-17 13:41:48,909] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 377 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 378 |
+
[2025-09-17 13:41:50,527] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 379 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 380 |
+
[2025-09-17 13:41:52,236] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 381 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 382 |
+
[2025-09-17 13:41:53,964] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 383 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 384 |
+
[2025-09-17 13:41:55,616] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 385 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 386 |
+
[2025-09-17 13:41:57,230] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 387 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 388 |
+
[2025-09-17 13:41:58,901] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 389 |
+
[2025-09-17 13:41:59,122] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
|
| 399 |
+
"architectures": [
|
| 400 |
+
"Qwen2ForCausalLM"
|
| 401 |
+
],
|
| 402 |
+
"attention_dropout": 0.0,
|
| 403 |
+
"bos_token_id": 151643,
|
| 404 |
+
"eos_token_id": 151645,
|
| 405 |
+
"hidden_act": "silu",
|
| 406 |
+
"hidden_size": 3584,
|
| 407 |
+
"initializer_range": 0.02,
|
| 408 |
+
"intermediate_size": 18944,
|
| 409 |
+
"layer_types": [
|
| 410 |
+
"full_attention",
|
| 411 |
+
"full_attention",
|
| 412 |
+
"full_attention",
|
| 413 |
+
"full_attention",
|
| 414 |
+
"full_attention",
|
| 415 |
+
"full_attention",
|
| 416 |
+
"full_attention",
|
| 417 |
+
"full_attention",
|
| 418 |
+
"full_attention",
|
| 419 |
+
"full_attention",
|
| 420 |
+
"full_attention",
|
| 421 |
+
"full_attention",
|
| 422 |
+
"full_attention",
|
| 423 |
+
"full_attention",
|
| 424 |
+
"full_attention",
|
| 425 |
+
"full_attention",
|
| 426 |
+
"full_attention",
|
| 427 |
+
"full_attention",
|
| 428 |
+
"full_attention",
|
| 429 |
+
"full_attention",
|
| 430 |
+
"full_attention",
|
| 431 |
+
"full_attention",
|
| 432 |
+
"full_attention",
|
| 433 |
+
"full_attention",
|
| 434 |
+
"full_attention",
|
| 435 |
+
"full_attention",
|
| 436 |
+
"full_attention",
|
| 437 |
+
"full_attention"
|
| 438 |
+
],
|
| 439 |
+
"max_position_embeddings": 32768,
|
| 440 |
+
"max_window_layers": 28,
|
| 441 |
+
"model_type": "qwen2",
|
| 442 |
+
"num_attention_heads": 28,
|
| 443 |
+
"num_hidden_layers": 28,
|
| 444 |
+
"num_key_value_heads": 4,
|
| 445 |
+
"pad_token_id": 151643,
|
| 446 |
+
"rms_norm_eps": 1e-06,
|
| 447 |
+
"rope_scaling": null,
|
| 448 |
+
"rope_theta": 1000000.0,
|
| 449 |
+
"sliding_window": null,
|
| 450 |
+
"tie_word_embeddings": false,
|
| 451 |
+
"torch_dtype": "bfloat16",
|
| 452 |
+
"transformers_version": "4.55.4",
|
| 453 |
+
"use_cache": true,
|
| 454 |
+
"use_sliding_window": false,
|
| 455 |
+
"vocab_size": 152064
|
| 456 |
+
}
|
| 457 |
+
, task_type='causal_lm', num_labels=None)
|
| 458 |
+
[INFO:swift] model.generation_config: GenerationConfig {
|
| 459 |
+
"bos_token_id": 151643,
|
| 460 |
+
"eos_token_id": [
|
| 461 |
+
151645,
|
| 462 |
+
151643
|
| 463 |
+
],
|
| 464 |
+
"max_new_tokens": 64,
|
| 465 |
+
"pad_token_id": 151643,
|
| 466 |
+
"repetition_penalty": 1.05
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
|
| 470 |
+
[INFO:swift] max_length: 16240
|
| 471 |
+
[INFO:swift] response_prefix: ''
|
| 472 |
+
[INFO:swift] agent_template: hermes
|
| 473 |
+
[INFO:swift] Start time of running main: 2025-09-17 13:42:01.223333
|
| 474 |
+
[INFO:swift] swift.__version__: 3.8.0.dev0
|
| 475 |
+
[rank0]: Traceback (most recent call last):
|
| 476 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 477 |
+
[rank0]: sft_main()
|
| 478 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 479 |
+
[rank0]: return SwiftSft(args).main()
|
| 480 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 481 |
+
[rank0]: result = self.run()
|
| 482 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 483 |
+
[rank0]: train_dataset, val_dataset = self._prepare_dataset()
|
| 484 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 485 |
+
[rank0]: train_dataset, val_dataset = self._get_dataset()
|
| 486 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 487 |
+
[rank0]: train_dataset, val_dataset = load_dataset(
|
| 488 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 489 |
+
[rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 490 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 491 |
+
[rank0]: dataset = DatasetLoader._load_repo_dataset(
|
| 492 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 493 |
+
[rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 494 |
+
[rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 495 |
+
[rank3]: Traceback (most recent call last):
|
| 496 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 497 |
+
[rank3]: sft_main()
|
| 498 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 499 |
+
[rank3]: return SwiftSft(args).main()
|
| 500 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 501 |
+
[rank3]: result = self.run()
|
| 502 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 503 |
+
[rank3]: train_dataset, val_dataset = self._prepare_dataset()
|
| 504 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 505 |
+
[rank3]: train_dataset, val_dataset = self._get_dataset()
|
| 506 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 507 |
+
[rank3]: train_dataset, val_dataset = load_dataset(
|
| 508 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 509 |
+
[rank3]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 510 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 511 |
+
[rank3]: dataset = DatasetLoader._load_repo_dataset(
|
| 512 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 513 |
+
[rank3]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 514 |
+
[rank3]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 515 |
+
[rank2]: Traceback (most recent call last):
|
| 516 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 517 |
+
[rank2]: sft_main()
|
| 518 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 519 |
+
[rank2]: return SwiftSft(args).main()
|
| 520 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 521 |
+
[rank2]: result = self.run()
|
| 522 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 523 |
+
[rank2]: train_dataset, val_dataset = self._prepare_dataset()
|
| 524 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 525 |
+
[rank2]: train_dataset, val_dataset = self._get_dataset()
|
| 526 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 527 |
+
[rank2]: train_dataset, val_dataset = load_dataset(
|
| 528 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 529 |
+
[rank2]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 530 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 531 |
+
[rank2]: dataset = DatasetLoader._load_repo_dataset(
|
| 532 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 533 |
+
[rank2]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 534 |
+
[rank2]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 535 |
+
[rank6]: Traceback (most recent call last):
|
| 536 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 537 |
+
[rank6]: sft_main()
|
| 538 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 539 |
+
[rank6]: return SwiftSft(args).main()
|
| 540 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 541 |
+
[rank6]: result = self.run()
|
| 542 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 543 |
+
[rank6]: train_dataset, val_dataset = self._prepare_dataset()
|
| 544 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 545 |
+
[rank6]: train_dataset, val_dataset = self._get_dataset()
|
| 546 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 547 |
+
[rank6]: train_dataset, val_dataset = load_dataset(
|
| 548 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 549 |
+
[rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 550 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 551 |
+
[rank6]: dataset = DatasetLoader._load_repo_dataset(
|
| 552 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 553 |
+
[rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 554 |
+
[rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 555 |
+
[rank7]: Traceback (most recent call last):
|
| 556 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 557 |
+
[rank7]: sft_main()
|
| 558 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 559 |
+
[rank7]: return SwiftSft(args).main()
|
| 560 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 561 |
+
[rank7]: result = self.run()
|
| 562 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 563 |
+
[rank7]: train_dataset, val_dataset = self._prepare_dataset()
|
| 564 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 565 |
+
[rank7]: train_dataset, val_dataset = self._get_dataset()
|
| 566 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 567 |
+
[rank7]: train_dataset, val_dataset = load_dataset(
|
| 568 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 569 |
+
[rank7]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 570 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 571 |
+
[rank7]: dataset = DatasetLoader._load_repo_dataset(
|
| 572 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 573 |
+
[rank7]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 574 |
+
[rank7]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 575 |
+
[rank5]: Traceback (most recent call last):
|
| 576 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 577 |
+
[rank5]: sft_main()
|
| 578 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 579 |
+
[rank5]: return SwiftSft(args).main()
|
| 580 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 581 |
+
[rank5]: result = self.run()
|
| 582 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 583 |
+
[rank5]: train_dataset, val_dataset = self._prepare_dataset()
|
| 584 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 585 |
+
[rank5]: train_dataset, val_dataset = self._get_dataset()
|
| 586 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 587 |
+
[rank5]: train_dataset, val_dataset = load_dataset(
|
| 588 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 589 |
+
[rank5]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 590 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 591 |
+
[rank5]: dataset = DatasetLoader._load_repo_dataset(
|
| 592 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 593 |
+
[rank5]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 594 |
+
[rank5]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 595 |
+
[rank4]: Traceback (most recent call last):
|
| 596 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 597 |
+
[rank4]: sft_main()
|
| 598 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 599 |
+
[rank4]: return SwiftSft(args).main()
|
| 600 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 601 |
+
[rank4]: result = self.run()
|
| 602 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 603 |
+
[rank4]: train_dataset, val_dataset = self._prepare_dataset()
|
| 604 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 605 |
+
[rank4]: train_dataset, val_dataset = self._get_dataset()
|
| 606 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 607 |
+
[rank4]: train_dataset, val_dataset = load_dataset(
|
| 608 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 609 |
+
[rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 610 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 611 |
+
[rank4]: dataset = DatasetLoader._load_repo_dataset(
|
| 612 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 613 |
+
[rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 614 |
+
[rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 615 |
+
[rank1]: Traceback (most recent call last):
|
| 616 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 617 |
+
[rank1]: sft_main()
|
| 618 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 619 |
+
[rank1]: return SwiftSft(args).main()
|
| 620 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 621 |
+
[rank1]: result = self.run()
|
| 622 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 623 |
+
[rank1]: train_dataset, val_dataset = self._prepare_dataset()
|
| 624 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 625 |
+
[rank1]: train_dataset, val_dataset = self._get_dataset()
|
| 626 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 627 |
+
[rank1]: train_dataset, val_dataset = load_dataset(
|
| 628 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 629 |
+
[rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 630 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 631 |
+
[rank1]: dataset = DatasetLoader._load_repo_dataset(
|
| 632 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 633 |
+
[rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 634 |
+
[rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 635 |
+
[rank0]:[W917 13:42:04.042692069 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 636 |
+
W0917 13:42:05.588000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215461 closing signal SIGTERM
|
| 637 |
+
W0917 13:42:05.589000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215462 closing signal SIGTERM
|
| 638 |
+
W0917 13:42:05.590000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215464 closing signal SIGTERM
|
| 639 |
+
W0917 13:42:05.590000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215465 closing signal SIGTERM
|
| 640 |
+
W0917 13:42:05.591000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215466 closing signal SIGTERM
|
| 641 |
+
W0917 13:42:05.592000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215467 closing signal SIGTERM
|
| 642 |
+
W0917 13:42:05.592000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 215468 closing signal SIGTERM
|
| 643 |
+
E0917 13:42:06.625000 215396 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 2 (pid: 215463) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
|
| 644 |
+
Traceback (most recent call last):
|
| 645 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
| 646 |
+
return _run_code(code, main_globals, None,
|
| 647 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
|
| 648 |
+
exec(code, run_globals)
|
| 649 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
|
| 650 |
+
main()
|
| 651 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
|
| 652 |
+
return f(*args, **kwargs)
|
| 653 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
|
| 654 |
+
run(args)
|
| 655 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
|
| 656 |
+
elastic_launch(
|
| 657 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
|
| 658 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 659 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
|
| 660 |
+
raise ChildFailedError(
|
| 661 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 662 |
+
============================================================
|
| 663 |
+
/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
|
| 664 |
+
------------------------------------------------------------
|
| 665 |
+
Failures:
|
| 666 |
+
<NO_OTHER_FAILURES>
|
| 667 |
+
------------------------------------------------------------
|
| 668 |
+
Root Cause (first observed failure):
|
| 669 |
+
[0]:
|
| 670 |
+
time : 2025-09-17_13:42:05
|
| 671 |
+
host : TENCENT64.site
|
| 672 |
+
rank : 2 (local_rank: 2)
|
| 673 |
+
exitcode : 1 (pid: 215463)
|
| 674 |
+
error_file: <N/A>
|
| 675 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 676 |
+
============================================================
|
log/20250917-13:44:32.log
ADDED
|
@@ -0,0 +1,676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 9 |
+
[2025-09-17 13:44:46,353] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 10 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 11 |
+
[2025-09-17 13:44:47,709] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 12 |
+
[2025-09-17 13:44:47,717] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 13 |
+
[2025-09-17 13:44:47,876] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 14 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 15 |
+
[2025-09-17 13:44:49,192] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 16 |
+
[2025-09-17 13:44:49,200] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 17 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 18 |
+
[2025-09-17 13:44:49,878] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 19 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 20 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 21 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 22 |
+
[INFO:swift] Setting args.lazy_tokenize: False
|
| 23 |
+
[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
|
| 24 |
+
[2025-09-17 13:44:51,216] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 25 |
+
[2025-09-17 13:44:51,225] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 26 |
+
[2025-09-17 13:44:51,315] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 27 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 28 |
+
[2025-09-17 13:44:52,668] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 29 |
+
[2025-09-17 13:44:52,676] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 30 |
+
[2025-09-17 13:44:52,676] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 31 |
+
[2025-09-17 13:44:53,216] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 32 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 33 |
+
[2025-09-17 13:44:54,529] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 34 |
+
[2025-09-17 13:44:54,538] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 35 |
+
[2025-09-17 13:44:54,907] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 36 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 37 |
+
[2025-09-17 13:44:56,316] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 38 |
+
[2025-09-17 13:44:56,325] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 39 |
+
[2025-09-17 13:44:56,562] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 40 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 41 |
+
[2025-09-17 13:44:57,878] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 42 |
+
[2025-09-17 13:44:57,886] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 43 |
+
[2025-09-17 13:44:58,181] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 44 |
+
[2025-09-17 13:44:59,565] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 45 |
+
[2025-09-17 13:44:59,574] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 46 |
+
[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501
|
| 47 |
+
[INFO:swift] Global seed set to 42
|
| 48 |
+
[INFO:swift] args: TrainArguments(
|
| 49 |
+
_n_gpu=-1,
|
| 50 |
+
acc_strategy=token,
|
| 51 |
+
accelerator_config={'dispatch_batches': False},
|
| 52 |
+
adafactor=False,
|
| 53 |
+
adalora_beta1=0.85,
|
| 54 |
+
adalora_beta2=0.85,
|
| 55 |
+
adalora_deltaT=1,
|
| 56 |
+
adalora_init_r=12,
|
| 57 |
+
adalora_orth_reg_weight=0.5,
|
| 58 |
+
adalora_target_r=8,
|
| 59 |
+
adalora_tfinal=0,
|
| 60 |
+
adalora_tinit=0,
|
| 61 |
+
adam_beta1=0.9,
|
| 62 |
+
adam_beta2=0.95,
|
| 63 |
+
adam_epsilon=1e-08,
|
| 64 |
+
adapter_act=gelu,
|
| 65 |
+
adapter_length=128,
|
| 66 |
+
adapters=[],
|
| 67 |
+
add_version=True,
|
| 68 |
+
agent_template=None,
|
| 69 |
+
aligner_lr=None,
|
| 70 |
+
attn_impl=None,
|
| 71 |
+
auto_find_batch_size=False,
|
| 72 |
+
average_tokens_across_devices=True,
|
| 73 |
+
batch_eval_metrics=False,
|
| 74 |
+
bf16=True,
|
| 75 |
+
bf16_full_eval=False,
|
| 76 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 77 |
+
bnb_4bit_quant_storage=None,
|
| 78 |
+
bnb_4bit_quant_type=nf4,
|
| 79 |
+
bnb_4bit_use_double_quant=True,
|
| 80 |
+
boft_block_num=0,
|
| 81 |
+
boft_block_size=4,
|
| 82 |
+
boft_dropout=0.0,
|
| 83 |
+
boft_n_butterfly_factor=1,
|
| 84 |
+
cached_dataset=[],
|
| 85 |
+
channels=None,
|
| 86 |
+
check_model=True,
|
| 87 |
+
ckpt_dir=None,
|
| 88 |
+
columns={},
|
| 89 |
+
create_checkpoint_symlink=False,
|
| 90 |
+
custom_dataset_info=[],
|
| 91 |
+
custom_register_path=[],
|
| 92 |
+
data_seed=42,
|
| 93 |
+
dataloader_drop_last=False,
|
| 94 |
+
dataloader_num_workers=48,
|
| 95 |
+
dataloader_persistent_workers=False,
|
| 96 |
+
dataloader_pin_memory=True,
|
| 97 |
+
dataloader_prefetch_factor=None,
|
| 98 |
+
dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
|
| 99 |
+
dataset_num_proc=100,
|
| 100 |
+
dataset_shuffle=True,
|
| 101 |
+
ddp_backend=None,
|
| 102 |
+
ddp_broadcast_buffers=None,
|
| 103 |
+
ddp_bucket_cap_mb=None,
|
| 104 |
+
ddp_find_unused_parameters=None,
|
| 105 |
+
ddp_timeout=18000000,
|
| 106 |
+
debug=None,
|
| 107 |
+
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
|
| 108 |
+
deepspeed_autotp_size=None,
|
| 109 |
+
device_map=None,
|
| 110 |
+
disable_tqdm=None,
|
| 111 |
+
do_eval=False,
|
| 112 |
+
do_predict=False,
|
| 113 |
+
do_train=False,
|
| 114 |
+
download_mode=reuse_dataset_if_exists,
|
| 115 |
+
ds3_gather_for_generation=True,
|
| 116 |
+
early_stop_interval=None,
|
| 117 |
+
enable_dft_loss=False,
|
| 118 |
+
eval_accumulation_steps=None,
|
| 119 |
+
eval_dataset=[],
|
| 120 |
+
eval_dataset_args=None,
|
| 121 |
+
eval_delay=0,
|
| 122 |
+
eval_do_concat_batches=True,
|
| 123 |
+
eval_generation_config=None,
|
| 124 |
+
eval_limit=None,
|
| 125 |
+
eval_on_start=False,
|
| 126 |
+
eval_steps=2000.0,
|
| 127 |
+
eval_strategy=epoch,
|
| 128 |
+
eval_use_evalscope=False,
|
| 129 |
+
eval_use_gather_object=False,
|
| 130 |
+
external_plugins=[],
|
| 131 |
+
extra_eval_args=None,
|
| 132 |
+
fourier_n_frequency=2000,
|
| 133 |
+
fourier_scaling=300.0,
|
| 134 |
+
fp16=False,
|
| 135 |
+
fp16_backend=auto,
|
| 136 |
+
fp16_full_eval=False,
|
| 137 |
+
fp16_opt_level=O1,
|
| 138 |
+
freeze_aligner=False,
|
| 139 |
+
freeze_llm=False,
|
| 140 |
+
freeze_parameters=[],
|
| 141 |
+
freeze_parameters_ratio=0.0,
|
| 142 |
+
freeze_parameters_regex=None,
|
| 143 |
+
freeze_vit=True,
|
| 144 |
+
fsdp=,
|
| 145 |
+
fsdp_config=None,
|
| 146 |
+
fsdp_min_num_params=0,
|
| 147 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 148 |
+
full_determinism=False,
|
| 149 |
+
galore_cos_threshold=0.4,
|
| 150 |
+
galore_gamma_proj=2,
|
| 151 |
+
galore_optim_per_parameter=False,
|
| 152 |
+
galore_proj_bits=4,
|
| 153 |
+
galore_proj_group_size=256,
|
| 154 |
+
galore_proj_quant=False,
|
| 155 |
+
galore_proj_type=std,
|
| 156 |
+
galore_quantization=False,
|
| 157 |
+
galore_queue_size=5,
|
| 158 |
+
galore_rank=128,
|
| 159 |
+
galore_scale=1.0,
|
| 160 |
+
galore_target_modules=None,
|
| 161 |
+
galore_update_proj_gap=50,
|
| 162 |
+
galore_with_embedding=False,
|
| 163 |
+
generation_config=None,
|
| 164 |
+
generation_max_length=None,
|
| 165 |
+
generation_num_beams=None,
|
| 166 |
+
gradient_accumulation_steps=4,
|
| 167 |
+
gradient_checkpointing=True,
|
| 168 |
+
gradient_checkpointing_kwargs=None,
|
| 169 |
+
greater_is_better=False,
|
| 170 |
+
group_by_length=False,
|
| 171 |
+
half_precision_backend=auto,
|
| 172 |
+
hqq_axis=None,
|
| 173 |
+
hub_always_push=False,
|
| 174 |
+
hub_model_id=None,
|
| 175 |
+
hub_private_repo=None,
|
| 176 |
+
hub_revision=None,
|
| 177 |
+
hub_strategy=every_save,
|
| 178 |
+
hub_token=<HUB_TOKEN>,
|
| 179 |
+
ignore_args_error=False,
|
| 180 |
+
ignore_data_skip=False,
|
| 181 |
+
include_for_metrics=[],
|
| 182 |
+
include_inputs_for_metrics=False,
|
| 183 |
+
include_num_input_tokens_seen=False,
|
| 184 |
+
include_tokens_per_second=False,
|
| 185 |
+
init_strategy=None,
|
| 186 |
+
init_weights=True,
|
| 187 |
+
interleave_prob=None,
|
| 188 |
+
jit_mode_eval=False,
|
| 189 |
+
label_names=None,
|
| 190 |
+
label_smoothing_factor=0.0,
|
| 191 |
+
lazy_tokenize=False,
|
| 192 |
+
learning_rate=5e-06,
|
| 193 |
+
length_column_name=length,
|
| 194 |
+
liger_kernel_config=None,
|
| 195 |
+
lisa_activated_layers=0,
|
| 196 |
+
lisa_step_interval=20,
|
| 197 |
+
llamapro_num_groups=None,
|
| 198 |
+
llamapro_num_new_blocks=4,
|
| 199 |
+
load_args=False,
|
| 200 |
+
load_best_model_at_end=False,
|
| 201 |
+
load_data_args=False,
|
| 202 |
+
load_from_cache_file=True,
|
| 203 |
+
local_rank=0,
|
| 204 |
+
local_repo_path=None,
|
| 205 |
+
log_level=passive,
|
| 206 |
+
log_level_replica=warning,
|
| 207 |
+
log_on_each_node=True,
|
| 208 |
+
logging_dir=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501/runs,
|
| 209 |
+
logging_first_step=True,
|
| 210 |
+
logging_nan_inf_filter=True,
|
| 211 |
+
logging_steps=1,
|
| 212 |
+
logging_strategy=steps,
|
| 213 |
+
logprobs=False,
|
| 214 |
+
lora_alpha=32,
|
| 215 |
+
lora_bias=none,
|
| 216 |
+
lora_dropout=0.05,
|
| 217 |
+
lora_dtype=None,
|
| 218 |
+
lora_ga_batch_size=2,
|
| 219 |
+
lora_ga_direction=ArB2r,
|
| 220 |
+
lora_ga_iters=2,
|
| 221 |
+
lora_ga_max_length=1024,
|
| 222 |
+
lora_ga_scale=stable,
|
| 223 |
+
lora_ga_stable_gamma=16,
|
| 224 |
+
lora_modules=[],
|
| 225 |
+
lora_rank=8,
|
| 226 |
+
lorap_lr_ratio=None,
|
| 227 |
+
loss_scale=default,
|
| 228 |
+
loss_type=None,
|
| 229 |
+
lr_scheduler_kwargs=None,
|
| 230 |
+
lr_scheduler_type=cosine,
|
| 231 |
+
max_epochs=None,
|
| 232 |
+
max_grad_norm=1.0,
|
| 233 |
+
max_length=16240,
|
| 234 |
+
max_memory={},
|
| 235 |
+
max_model_len=None,
|
| 236 |
+
max_new_tokens=64,
|
| 237 |
+
max_pixels=None,
|
| 238 |
+
max_steps=-1,
|
| 239 |
+
metric=None,
|
| 240 |
+
metric_for_best_model=loss,
|
| 241 |
+
model=Qwen/Qwen2.5-7B-Instruct,
|
| 242 |
+
model_author=None,
|
| 243 |
+
model_kwargs={},
|
| 244 |
+
model_name=None,
|
| 245 |
+
model_revision=None,
|
| 246 |
+
model_type=qwen2_5,
|
| 247 |
+
modules_to_save=[],
|
| 248 |
+
mp_parameters=,
|
| 249 |
+
neftune_noise_alpha=None,
|
| 250 |
+
new_special_tokens=[],
|
| 251 |
+
no_cuda=False,
|
| 252 |
+
norm_bbox=None,
|
| 253 |
+
num_beams=1,
|
| 254 |
+
num_labels=None,
|
| 255 |
+
num_train_epochs=2.0,
|
| 256 |
+
optim=adamw_torch_fused,
|
| 257 |
+
optim_args=None,
|
| 258 |
+
optim_target_modules=None,
|
| 259 |
+
optimizer=None,
|
| 260 |
+
output_dir=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501,
|
| 261 |
+
overwrite_output_dir=False,
|
| 262 |
+
packing=False,
|
| 263 |
+
packing_length=None,
|
| 264 |
+
padding_free=False,
|
| 265 |
+
padding_side=right,
|
| 266 |
+
past_index=-1,
|
| 267 |
+
per_device_eval_batch_size=1,
|
| 268 |
+
per_device_train_batch_size=2,
|
| 269 |
+
predict_with_generate=False,
|
| 270 |
+
prediction_loss_only=False,
|
| 271 |
+
problem_type=None,
|
| 272 |
+
push_to_hub=False,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 276 |
+
quant_bits=None,
|
| 277 |
+
quant_method=None,
|
| 278 |
+
ray_scope=last,
|
| 279 |
+
reft_args=None,
|
| 280 |
+
reft_intervention_type=LoreftIntervention,
|
| 281 |
+
reft_layer_key=None,
|
| 282 |
+
reft_layers=None,
|
| 283 |
+
reft_rank=4,
|
| 284 |
+
remove_unused_columns=True,
|
| 285 |
+
repetition_penalty=None,
|
| 286 |
+
report_to=['tensorboard'],
|
| 287 |
+
response_prefix=None,
|
| 288 |
+
restore_callback_states_from_checkpoint=False,
|
| 289 |
+
resume_from_checkpoint=None,
|
| 290 |
+
resume_only_model=False,
|
| 291 |
+
rope_scaling=None,
|
| 292 |
+
router_aux_loss_coef=0.0,
|
| 293 |
+
run_name=/group/40143/hongzhuyi/ms-swift/output/v4-20250917-134501,
|
| 294 |
+
save_on_each_node=False,
|
| 295 |
+
save_only_model=False,
|
| 296 |
+
save_safetensors=True,
|
| 297 |
+
save_steps=500,
|
| 298 |
+
save_strategy=epoch,
|
| 299 |
+
save_total_limit=None,
|
| 300 |
+
seed=42,
|
| 301 |
+
sequence_parallel_size=1,
|
| 302 |
+
shuffle_buffer_size=1000,
|
| 303 |
+
skip_memory_metrics=True,
|
| 304 |
+
sortish_sampler=False,
|
| 305 |
+
split_dataset_ratio=0.001,
|
| 306 |
+
stop_words=[],
|
| 307 |
+
stopping_strategy=first_exhausted,
|
| 308 |
+
stream=False,
|
| 309 |
+
streaming=False,
|
| 310 |
+
strict=False,
|
| 311 |
+
swanlab_exp_name=None,
|
| 312 |
+
swanlab_lark_secret=None,
|
| 313 |
+
swanlab_lark_webhook_url=None,
|
| 314 |
+
swanlab_mode=cloud,
|
| 315 |
+
swanlab_project=None,
|
| 316 |
+
swanlab_token=<SWANLAB_TOKEN>,
|
| 317 |
+
swanlab_workspace=None,
|
| 318 |
+
system=None,
|
| 319 |
+
target_modules=['all-linear'],
|
| 320 |
+
target_regex=None,
|
| 321 |
+
task_type=causal_lm,
|
| 322 |
+
temperature=0.0,
|
| 323 |
+
template=qwen2_5,
|
| 324 |
+
template_backend=swift,
|
| 325 |
+
tf32=None,
|
| 326 |
+
top_k=None,
|
| 327 |
+
top_logprobs=None,
|
| 328 |
+
top_p=None,
|
| 329 |
+
torch_compile=False,
|
| 330 |
+
torch_compile_backend=None,
|
| 331 |
+
torch_compile_mode=None,
|
| 332 |
+
torch_dtype=torch.bfloat16,
|
| 333 |
+
torch_empty_cache_steps=None,
|
| 334 |
+
torchdynamo=None,
|
| 335 |
+
tpu_metrics_debug=False,
|
| 336 |
+
tpu_num_cores=None,
|
| 337 |
+
train_dataloader_shuffle=True,
|
| 338 |
+
train_type=full,
|
| 339 |
+
trainable_parameters=[],
|
| 340 |
+
trainable_parameters_regex=None,
|
| 341 |
+
truncation_strategy=delete,
|
| 342 |
+
tuner_backend=peft,
|
| 343 |
+
use_chat_template=True,
|
| 344 |
+
use_cpu=False,
|
| 345 |
+
use_dora=False,
|
| 346 |
+
use_flash_ckpt=False,
|
| 347 |
+
use_galore=False,
|
| 348 |
+
use_hf=False,
|
| 349 |
+
use_ipex=False,
|
| 350 |
+
use_legacy_prediction_loop=False,
|
| 351 |
+
use_liger_kernel=False,
|
| 352 |
+
use_logits_to_keep=None,
|
| 353 |
+
use_mps_device=False,
|
| 354 |
+
use_rslora=False,
|
| 355 |
+
use_swift_lora=False,
|
| 356 |
+
val_dataset=[],
|
| 357 |
+
val_dataset_shuffle=False,
|
| 358 |
+
vera_d_initial=0.1,
|
| 359 |
+
vera_dropout=0.0,
|
| 360 |
+
vera_projection_prng_key=0,
|
| 361 |
+
vera_rank=256,
|
| 362 |
+
vit_gradient_checkpointing=None,
|
| 363 |
+
vit_lr=None,
|
| 364 |
+
warmup_ratio=0.05,
|
| 365 |
+
warmup_steps=0,
|
| 366 |
+
weight_decay=0.1,
|
| 367 |
+
zero_hpz_partition_size=None,
|
| 368 |
+
)
|
| 369 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 370 |
+
[2025-09-17 13:45:05,431] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 371 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 372 |
+
[2025-09-17 13:45:07,082] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 373 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 374 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 375 |
+
[2025-09-17 13:45:08,746] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 376 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 377 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 378 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 379 |
+
[INFO:swift] model_kwargs: {'device_map': None}
|
| 380 |
+
[2025-09-17 13:45:10,258] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 381 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 382 |
+
[2025-09-17 13:45:11,990] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 383 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 384 |
+
[2025-09-17 13:45:13,654] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 385 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 386 |
+
[2025-09-17 13:45:15,393] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 387 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 388 |
+
[2025-09-17 13:45:17,053] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 389 |
+
[2025-09-17 13:45:17,199] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
|
| 399 |
+
"architectures": [
|
| 400 |
+
"Qwen2ForCausalLM"
|
| 401 |
+
],
|
| 402 |
+
"attention_dropout": 0.0,
|
| 403 |
+
"bos_token_id": 151643,
|
| 404 |
+
"eos_token_id": 151645,
|
| 405 |
+
"hidden_act": "silu",
|
| 406 |
+
"hidden_size": 3584,
|
| 407 |
+
"initializer_range": 0.02,
|
| 408 |
+
"intermediate_size": 18944,
|
| 409 |
+
"layer_types": [
|
| 410 |
+
"full_attention",
|
| 411 |
+
"full_attention",
|
| 412 |
+
"full_attention",
|
| 413 |
+
"full_attention",
|
| 414 |
+
"full_attention",
|
| 415 |
+
"full_attention",
|
| 416 |
+
"full_attention",
|
| 417 |
+
"full_attention",
|
| 418 |
+
"full_attention",
|
| 419 |
+
"full_attention",
|
| 420 |
+
"full_attention",
|
| 421 |
+
"full_attention",
|
| 422 |
+
"full_attention",
|
| 423 |
+
"full_attention",
|
| 424 |
+
"full_attention",
|
| 425 |
+
"full_attention",
|
| 426 |
+
"full_attention",
|
| 427 |
+
"full_attention",
|
| 428 |
+
"full_attention",
|
| 429 |
+
"full_attention",
|
| 430 |
+
"full_attention",
|
| 431 |
+
"full_attention",
|
| 432 |
+
"full_attention",
|
| 433 |
+
"full_attention",
|
| 434 |
+
"full_attention",
|
| 435 |
+
"full_attention",
|
| 436 |
+
"full_attention",
|
| 437 |
+
"full_attention"
|
| 438 |
+
],
|
| 439 |
+
"max_position_embeddings": 32768,
|
| 440 |
+
"max_window_layers": 28,
|
| 441 |
+
"model_type": "qwen2",
|
| 442 |
+
"num_attention_heads": 28,
|
| 443 |
+
"num_hidden_layers": 28,
|
| 444 |
+
"num_key_value_heads": 4,
|
| 445 |
+
"pad_token_id": 151643,
|
| 446 |
+
"rms_norm_eps": 1e-06,
|
| 447 |
+
"rope_scaling": null,
|
| 448 |
+
"rope_theta": 1000000.0,
|
| 449 |
+
"sliding_window": null,
|
| 450 |
+
"tie_word_embeddings": false,
|
| 451 |
+
"torch_dtype": "bfloat16",
|
| 452 |
+
"transformers_version": "4.55.4",
|
| 453 |
+
"use_cache": true,
|
| 454 |
+
"use_sliding_window": false,
|
| 455 |
+
"vocab_size": 152064
|
| 456 |
+
}
|
| 457 |
+
, task_type='causal_lm', num_labels=None)
|
| 458 |
+
[INFO:swift] model.generation_config: GenerationConfig {
|
| 459 |
+
"bos_token_id": 151643,
|
| 460 |
+
"eos_token_id": [
|
| 461 |
+
151645,
|
| 462 |
+
151643
|
| 463 |
+
],
|
| 464 |
+
"max_new_tokens": 64,
|
| 465 |
+
"pad_token_id": 151643,
|
| 466 |
+
"repetition_penalty": 1.05
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
|
| 470 |
+
[INFO:swift] max_length: 16240
|
| 471 |
+
[INFO:swift] response_prefix: ''
|
| 472 |
+
[INFO:swift] agent_template: hermes
|
| 473 |
+
[INFO:swift] Start time of running main: 2025-09-17 13:45:19.304293
|
| 474 |
+
[INFO:swift] swift.__version__: 3.8.0.dev0
|
| 475 |
+
[rank0]: Traceback (most recent call last):
|
| 476 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 477 |
+
[rank0]: sft_main()
|
| 478 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 479 |
+
[rank0]: return SwiftSft(args).main()
|
| 480 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 481 |
+
[rank0]: result = self.run()
|
| 482 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 483 |
+
[rank0]: train_dataset, val_dataset = self._prepare_dataset()
|
| 484 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 485 |
+
[rank0]: train_dataset, val_dataset = self._get_dataset()
|
| 486 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 487 |
+
[rank0]: train_dataset, val_dataset = load_dataset(
|
| 488 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 489 |
+
[rank0]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 490 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 491 |
+
[rank0]: dataset = DatasetLoader._load_repo_dataset(
|
| 492 |
+
[rank0]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 493 |
+
[rank0]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 494 |
+
[rank0]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 495 |
+
[rank7]: Traceback (most recent call last):
|
| 496 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 497 |
+
[rank7]: sft_main()
|
| 498 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 499 |
+
[rank7]: return SwiftSft(args).main()
|
| 500 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 501 |
+
[rank7]: result = self.run()
|
| 502 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 503 |
+
[rank7]: train_dataset, val_dataset = self._prepare_dataset()
|
| 504 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 505 |
+
[rank7]: train_dataset, val_dataset = self._get_dataset()
|
| 506 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 507 |
+
[rank7]: train_dataset, val_dataset = load_dataset(
|
| 508 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 509 |
+
[rank7]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 510 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 511 |
+
[rank7]: dataset = DatasetLoader._load_repo_dataset(
|
| 512 |
+
[rank7]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 513 |
+
[rank7]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 514 |
+
[rank7]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 515 |
+
[rank1]: Traceback (most recent call last):
|
| 516 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 517 |
+
[rank1]: sft_main()
|
| 518 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 519 |
+
[rank1]: return SwiftSft(args).main()
|
| 520 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 521 |
+
[rank1]: result = self.run()
|
| 522 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 523 |
+
[rank1]: train_dataset, val_dataset = self._prepare_dataset()
|
| 524 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 525 |
+
[rank1]: train_dataset, val_dataset = self._get_dataset()
|
| 526 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 527 |
+
[rank1]: train_dataset, val_dataset = load_dataset(
|
| 528 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 529 |
+
[rank1]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 530 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 531 |
+
[rank1]: dataset = DatasetLoader._load_repo_dataset(
|
| 532 |
+
[rank1]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 533 |
+
[rank1]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 534 |
+
[rank1]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 535 |
+
[rank4]: Traceback (most recent call last):
|
| 536 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 537 |
+
[rank4]: sft_main()
|
| 538 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 539 |
+
[rank4]: return SwiftSft(args).main()
|
| 540 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 541 |
+
[rank4]: result = self.run()
|
| 542 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 543 |
+
[rank4]: train_dataset, val_dataset = self._prepare_dataset()
|
| 544 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 545 |
+
[rank4]: train_dataset, val_dataset = self._get_dataset()
|
| 546 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 547 |
+
[rank4]: train_dataset, val_dataset = load_dataset(
|
| 548 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 549 |
+
[rank4]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 550 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 551 |
+
[rank4]: dataset = DatasetLoader._load_repo_dataset(
|
| 552 |
+
[rank4]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 553 |
+
[rank4]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 554 |
+
[rank4]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 555 |
+
[rank5]: Traceback (most recent call last):
|
| 556 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 557 |
+
[rank5]: sft_main()
|
| 558 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 559 |
+
[rank5]: return SwiftSft(args).main()
|
| 560 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 561 |
+
[rank5]: result = self.run()
|
| 562 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 563 |
+
[rank5]: train_dataset, val_dataset = self._prepare_dataset()
|
| 564 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 565 |
+
[rank5]: train_dataset, val_dataset = self._get_dataset()
|
| 566 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 567 |
+
[rank5]: train_dataset, val_dataset = load_dataset(
|
| 568 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 569 |
+
[rank5]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 570 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 571 |
+
[rank5]: dataset = DatasetLoader._load_repo_dataset(
|
| 572 |
+
[rank5]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 573 |
+
[rank5]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 574 |
+
[rank5]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 575 |
+
[rank3]: Traceback (most recent call last):
|
| 576 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 577 |
+
[rank3]: sft_main()
|
| 578 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 579 |
+
[rank3]: return SwiftSft(args).main()
|
| 580 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 581 |
+
[rank3]: result = self.run()
|
| 582 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 583 |
+
[rank3]: train_dataset, val_dataset = self._prepare_dataset()
|
| 584 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 585 |
+
[rank3]: train_dataset, val_dataset = self._get_dataset()
|
| 586 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 587 |
+
[rank3]: train_dataset, val_dataset = load_dataset(
|
| 588 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 589 |
+
[rank3]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 590 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 591 |
+
[rank3]: dataset = DatasetLoader._load_repo_dataset(
|
| 592 |
+
[rank3]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 593 |
+
[rank3]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 594 |
+
[rank3]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 595 |
+
[rank2]: Traceback (most recent call last):
|
| 596 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 597 |
+
[rank2]: sft_main()
|
| 598 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 599 |
+
[rank2]: return SwiftSft(args).main()
|
| 600 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 601 |
+
[rank2]: result = self.run()
|
| 602 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 603 |
+
[rank2]: train_dataset, val_dataset = self._prepare_dataset()
|
| 604 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 605 |
+
[rank2]: train_dataset, val_dataset = self._get_dataset()
|
| 606 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 607 |
+
[rank2]: train_dataset, val_dataset = load_dataset(
|
| 608 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 609 |
+
[rank2]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 610 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 611 |
+
[rank2]: dataset = DatasetLoader._load_repo_dataset(
|
| 612 |
+
[rank2]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 613 |
+
[rank2]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 614 |
+
[rank2]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 615 |
+
[rank6]: Traceback (most recent call last):
|
| 616 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py", line 10, in <module>
|
| 617 |
+
[rank6]: sft_main()
|
| 618 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 328, in sft_main
|
| 619 |
+
[rank6]: return SwiftSft(args).main()
|
| 620 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/base.py", line 49, in main
|
| 621 |
+
[rank6]: result = self.run()
|
| 622 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 158, in run
|
| 623 |
+
[rank6]: train_dataset, val_dataset = self._prepare_dataset()
|
| 624 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 117, in _prepare_dataset
|
| 625 |
+
[rank6]: train_dataset, val_dataset = self._get_dataset()
|
| 626 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/train/sft.py", line 71, in _get_dataset
|
| 627 |
+
[rank6]: train_dataset, val_dataset = load_dataset(
|
| 628 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 533, in load_dataset
|
| 629 |
+
[rank6]: train_dataset = load_function(dataset_syntax, dataset_meta, **load_kwargs, use_hf=use_hf)
|
| 630 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 408, in load
|
| 631 |
+
[rank6]: dataset = DatasetLoader._load_repo_dataset(
|
| 632 |
+
[rank6]: File "/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/loader.py", line 249, in _load_repo_dataset
|
| 633 |
+
[rank6]: raise ValueError(f'The local path does not exist, dataset_id: `{dataset_id}`. '
|
| 634 |
+
[rank6]: ValueError: The local path does not exist, dataset_id: `/group/40143/hongzhuyi/ms-swift/corr_hotpot_new1369q_0.8_format_swift.jsonl`. os.path.exists(dataset_id): False
|
| 635 |
+
[rank0]:[W917 13:45:22.970566457 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
| 636 |
+
W0917 13:45:23.570000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216883 closing signal SIGTERM
|
| 637 |
+
W0917 13:45:23.570000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216884 closing signal SIGTERM
|
| 638 |
+
W0917 13:45:23.573000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216885 closing signal SIGTERM
|
| 639 |
+
W0917 13:45:23.575000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216886 closing signal SIGTERM
|
| 640 |
+
W0917 13:45:23.575000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216888 closing signal SIGTERM
|
| 641 |
+
W0917 13:45:23.577000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216889 closing signal SIGTERM
|
| 642 |
+
W0917 13:45:23.577000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 216890 closing signal SIGTERM
|
| 643 |
+
E0917 13:45:24.622000 216818 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 4 (pid: 216887) of binary: /root/miniconda3/envs/ms-swift/bin/python3.10
|
| 644 |
+
Traceback (most recent call last):
|
| 645 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
| 646 |
+
return _run_code(code, main_globals, None,
|
| 647 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/runpy.py", line 86, in _run_code
|
| 648 |
+
exec(code, run_globals)
|
| 649 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
|
| 650 |
+
main()
|
| 651 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
|
| 652 |
+
return f(*args, **kwargs)
|
| 653 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
|
| 654 |
+
run(args)
|
| 655 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
|
| 656 |
+
elastic_launch(
|
| 657 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
|
| 658 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 659 |
+
File "/root/miniconda3/envs/ms-swift/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
|
| 660 |
+
raise ChildFailedError(
|
| 661 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 662 |
+
============================================================
|
| 663 |
+
/group/40143/hongzhuyi/ms-swift/swift/cli/sft.py FAILED
|
| 664 |
+
------------------------------------------------------------
|
| 665 |
+
Failures:
|
| 666 |
+
<NO_OTHER_FAILURES>
|
| 667 |
+
------------------------------------------------------------
|
| 668 |
+
Root Cause (first observed failure):
|
| 669 |
+
[0]:
|
| 670 |
+
time : 2025-09-17_13:45:23
|
| 671 |
+
host : TENCENT64.site
|
| 672 |
+
rank : 4 (local_rank: 4)
|
| 673 |
+
exitcode : 1 (pid: 216887)
|
| 674 |
+
error_file: <N/A>
|
| 675 |
+
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
| 676 |
+
============================================================
|
log/20250917-13:46:26.log
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
|
|
|
|
| 1 |
|
|
|
|
|
|
| 1 |
+
run sh: `/root/miniconda3/envs/ms-swift/bin/python3.10 -m torch.distributed.run --nproc_per_node 8 /group/40143/hongzhuyi/ms-swift/swift/cli/sft.py --torch_dtype bfloat16 --freeze_llm false --freeze_aligner false --model Qwen/Qwen2.5-7B-Instruct --train_type full --dataset /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl /group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl /group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl --model_type qwen2_5 --dataset_num_proc 100 --dataloader_num_workers 48 --split_dataset_ratio 0.001 --warmup_ratio 0.05 --num_train_epochs 2 --per_device_train_batch_size 2 --learning_rate 5e-6 --gradient_accumulation_steps 4 --eval_steps 2000 --save_strategy epoch --logging_steps 1 --deepspeed zero3 --max_length 16240 --output_dir ./output`
|
| 2 |
+
|
| 3 |
+
*****************************************
|
| 4 |
+
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
*****************************************
|
| 6 |
+
[INFO:swift] Successfully registered `/group/40143/hongzhuyi/ms-swift/swift/llm/dataset/data/dataset_info.json`.
|
| 7 |
+
[INFO:swift] rank: 0, local_rank: 0, world_size: 8, local_world_size: 8
|
| 8 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 9 |
+
[2025-09-17 13:46:39,893] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 10 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 11 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
[2025-09-17 13:46:41,247] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 13 |
+
[2025-09-17 13:46:41,255] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 14 |
+
[2025-09-17 13:46:41,564] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 15 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 16 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 17 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 18 |
+
[INFO:swift] Setting args.lazy_tokenize: False
|
| 19 |
+
[2025-09-17 13:46:42,898] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 20 |
+
[2025-09-17 13:46:42,906] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 21 |
+
[INFO:swift] Using deepspeed: {'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}
|
| 22 |
+
[2025-09-17 13:46:43,796] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 23 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 24 |
+
[2025-09-17 13:46:45,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 25 |
+
[2025-09-17 13:46:45,109] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 26 |
+
[2025-09-17 13:46:45,117] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 27 |
+
[2025-09-17 13:46:45,117] [INFO] [comm.py:852:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 28 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 29 |
+
[2025-09-17 13:46:46,411] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 30 |
+
[2025-09-17 13:46:46,420] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 31 |
+
[2025-09-17 13:46:46,916] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 32 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 33 |
+
[2025-09-17 13:46:48,428] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 34 |
+
[2025-09-17 13:46:48,437] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 35 |
+
[2025-09-17 13:46:48,861] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 36 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 37 |
+
[2025-09-17 13:46:50,317] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 38 |
+
[2025-09-17 13:46:50,474] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 39 |
+
[2025-09-17 13:46:50,483] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 40 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 41 |
+
[2025-09-17 13:46:51,692] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 42 |
+
[2025-09-17 13:46:51,701] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 43 |
+
[2025-09-17 13:46:51,819] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 44 |
+
[2025-09-17 13:46:53,148] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
|
| 45 |
+
[2025-09-17 13:46:53,156] [INFO] [comm.py:821:init_distributed] cdb=None
|
| 46 |
+
[INFO:swift] output_dir: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655
|
| 47 |
+
[INFO:swift] Global seed set to 42
|
| 48 |
+
[INFO:swift] args: TrainArguments(
|
| 49 |
+
_n_gpu=-1,
|
| 50 |
+
acc_strategy=token,
|
| 51 |
+
accelerator_config={'dispatch_batches': False},
|
| 52 |
+
adafactor=False,
|
| 53 |
+
adalora_beta1=0.85,
|
| 54 |
+
adalora_beta2=0.85,
|
| 55 |
+
adalora_deltaT=1,
|
| 56 |
+
adalora_init_r=12,
|
| 57 |
+
adalora_orth_reg_weight=0.5,
|
| 58 |
+
adalora_target_r=8,
|
| 59 |
+
adalora_tfinal=0,
|
| 60 |
+
adalora_tinit=0,
|
| 61 |
+
adam_beta1=0.9,
|
| 62 |
+
adam_beta2=0.95,
|
| 63 |
+
adam_epsilon=1e-08,
|
| 64 |
+
adapter_act=gelu,
|
| 65 |
+
adapter_length=128,
|
| 66 |
+
adapters=[],
|
| 67 |
+
add_version=True,
|
| 68 |
+
agent_template=None,
|
| 69 |
+
aligner_lr=None,
|
| 70 |
+
attn_impl=None,
|
| 71 |
+
auto_find_batch_size=False,
|
| 72 |
+
average_tokens_across_devices=True,
|
| 73 |
+
batch_eval_metrics=False,
|
| 74 |
+
bf16=True,
|
| 75 |
+
bf16_full_eval=False,
|
| 76 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 77 |
+
bnb_4bit_quant_storage=None,
|
| 78 |
+
bnb_4bit_quant_type=nf4,
|
| 79 |
+
bnb_4bit_use_double_quant=True,
|
| 80 |
+
boft_block_num=0,
|
| 81 |
+
boft_block_size=4,
|
| 82 |
+
boft_dropout=0.0,
|
| 83 |
+
boft_n_butterfly_factor=1,
|
| 84 |
+
cached_dataset=[],
|
| 85 |
+
channels=None,
|
| 86 |
+
check_model=True,
|
| 87 |
+
ckpt_dir=None,
|
| 88 |
+
columns={},
|
| 89 |
+
create_checkpoint_symlink=False,
|
| 90 |
+
custom_dataset_info=[],
|
| 91 |
+
custom_register_path=[],
|
| 92 |
+
data_seed=42,
|
| 93 |
+
dataloader_drop_last=False,
|
| 94 |
+
dataloader_num_workers=48,
|
| 95 |
+
dataloader_persistent_workers=False,
|
| 96 |
+
dataloader_pin_memory=True,
|
| 97 |
+
dataloader_prefetch_factor=None,
|
| 98 |
+
dataset=['/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl', '/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl'],
|
| 99 |
+
dataset_num_proc=100,
|
| 100 |
+
dataset_shuffle=True,
|
| 101 |
+
ddp_backend=None,
|
| 102 |
+
ddp_broadcast_buffers=None,
|
| 103 |
+
ddp_bucket_cap_mb=None,
|
| 104 |
+
ddp_find_unused_parameters=None,
|
| 105 |
+
ddp_timeout=18000000,
|
| 106 |
+
debug=None,
|
| 107 |
+
deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False},
|
| 108 |
+
deepspeed_autotp_size=None,
|
| 109 |
+
device_map=None,
|
| 110 |
+
disable_tqdm=None,
|
| 111 |
+
do_eval=False,
|
| 112 |
+
do_predict=False,
|
| 113 |
+
do_train=False,
|
| 114 |
+
download_mode=reuse_dataset_if_exists,
|
| 115 |
+
ds3_gather_for_generation=True,
|
| 116 |
+
early_stop_interval=None,
|
| 117 |
+
enable_dft_loss=False,
|
| 118 |
+
eval_accumulation_steps=None,
|
| 119 |
+
eval_dataset=[],
|
| 120 |
+
eval_dataset_args=None,
|
| 121 |
+
eval_delay=0,
|
| 122 |
+
eval_do_concat_batches=True,
|
| 123 |
+
eval_generation_config=None,
|
| 124 |
+
eval_limit=None,
|
| 125 |
+
eval_on_start=False,
|
| 126 |
+
eval_steps=2000.0,
|
| 127 |
+
eval_strategy=epoch,
|
| 128 |
+
eval_use_evalscope=False,
|
| 129 |
+
eval_use_gather_object=False,
|
| 130 |
+
external_plugins=[],
|
| 131 |
+
extra_eval_args=None,
|
| 132 |
+
fourier_n_frequency=2000,
|
| 133 |
+
fourier_scaling=300.0,
|
| 134 |
+
fp16=False,
|
| 135 |
+
fp16_backend=auto,
|
| 136 |
+
fp16_full_eval=False,
|
| 137 |
+
fp16_opt_level=O1,
|
| 138 |
+
freeze_aligner=False,
|
| 139 |
+
freeze_llm=False,
|
| 140 |
+
freeze_parameters=[],
|
| 141 |
+
freeze_parameters_ratio=0.0,
|
| 142 |
+
freeze_parameters_regex=None,
|
| 143 |
+
freeze_vit=True,
|
| 144 |
+
fsdp=,
|
| 145 |
+
fsdp_config=None,
|
| 146 |
+
fsdp_min_num_params=0,
|
| 147 |
+
fsdp_transformer_layer_cls_to_wrap=None,
|
| 148 |
+
full_determinism=False,
|
| 149 |
+
galore_cos_threshold=0.4,
|
| 150 |
+
galore_gamma_proj=2,
|
| 151 |
+
galore_optim_per_parameter=False,
|
| 152 |
+
galore_proj_bits=4,
|
| 153 |
+
galore_proj_group_size=256,
|
| 154 |
+
galore_proj_quant=False,
|
| 155 |
+
galore_proj_type=std,
|
| 156 |
+
galore_quantization=False,
|
| 157 |
+
galore_queue_size=5,
|
| 158 |
+
galore_rank=128,
|
| 159 |
+
galore_scale=1.0,
|
| 160 |
+
galore_target_modules=None,
|
| 161 |
+
galore_update_proj_gap=50,
|
| 162 |
+
galore_with_embedding=False,
|
| 163 |
+
generation_config=None,
|
| 164 |
+
generation_max_length=None,
|
| 165 |
+
generation_num_beams=None,
|
| 166 |
+
gradient_accumulation_steps=4,
|
| 167 |
+
gradient_checkpointing=True,
|
| 168 |
+
gradient_checkpointing_kwargs=None,
|
| 169 |
+
greater_is_better=False,
|
| 170 |
+
group_by_length=False,
|
| 171 |
+
half_precision_backend=auto,
|
| 172 |
+
hqq_axis=None,
|
| 173 |
+
hub_always_push=False,
|
| 174 |
+
hub_model_id=None,
|
| 175 |
+
hub_private_repo=None,
|
| 176 |
+
hub_revision=None,
|
| 177 |
+
hub_strategy=every_save,
|
| 178 |
+
hub_token=<HUB_TOKEN>,
|
| 179 |
+
ignore_args_error=False,
|
| 180 |
+
ignore_data_skip=False,
|
| 181 |
+
include_for_metrics=[],
|
| 182 |
+
include_inputs_for_metrics=False,
|
| 183 |
+
include_num_input_tokens_seen=False,
|
| 184 |
+
include_tokens_per_second=False,
|
| 185 |
+
init_strategy=None,
|
| 186 |
+
init_weights=True,
|
| 187 |
+
interleave_prob=None,
|
| 188 |
+
jit_mode_eval=False,
|
| 189 |
+
label_names=None,
|
| 190 |
+
label_smoothing_factor=0.0,
|
| 191 |
+
lazy_tokenize=False,
|
| 192 |
+
learning_rate=5e-06,
|
| 193 |
+
length_column_name=length,
|
| 194 |
+
liger_kernel_config=None,
|
| 195 |
+
lisa_activated_layers=0,
|
| 196 |
+
lisa_step_interval=20,
|
| 197 |
+
llamapro_num_groups=None,
|
| 198 |
+
llamapro_num_new_blocks=4,
|
| 199 |
+
load_args=False,
|
| 200 |
+
load_best_model_at_end=False,
|
| 201 |
+
load_data_args=False,
|
| 202 |
+
load_from_cache_file=True,
|
| 203 |
+
local_rank=0,
|
| 204 |
+
local_repo_path=None,
|
| 205 |
+
log_level=passive,
|
| 206 |
+
log_level_replica=warning,
|
| 207 |
+
log_on_each_node=True,
|
| 208 |
+
logging_dir=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs,
|
| 209 |
+
logging_first_step=True,
|
| 210 |
+
logging_nan_inf_filter=True,
|
| 211 |
+
logging_steps=1,
|
| 212 |
+
logging_strategy=steps,
|
| 213 |
+
logprobs=False,
|
| 214 |
+
lora_alpha=32,
|
| 215 |
+
lora_bias=none,
|
| 216 |
+
lora_dropout=0.05,
|
| 217 |
+
lora_dtype=None,
|
| 218 |
+
lora_ga_batch_size=2,
|
| 219 |
+
lora_ga_direction=ArB2r,
|
| 220 |
+
lora_ga_iters=2,
|
| 221 |
+
lora_ga_max_length=1024,
|
| 222 |
+
lora_ga_scale=stable,
|
| 223 |
+
lora_ga_stable_gamma=16,
|
| 224 |
+
lora_modules=[],
|
| 225 |
+
lora_rank=8,
|
| 226 |
+
lorap_lr_ratio=None,
|
| 227 |
+
loss_scale=default,
|
| 228 |
+
loss_type=None,
|
| 229 |
+
lr_scheduler_kwargs=None,
|
| 230 |
+
lr_scheduler_type=cosine,
|
| 231 |
+
max_epochs=None,
|
| 232 |
+
max_grad_norm=1.0,
|
| 233 |
+
max_length=16240,
|
| 234 |
+
max_memory={},
|
| 235 |
+
max_model_len=None,
|
| 236 |
+
max_new_tokens=64,
|
| 237 |
+
max_pixels=None,
|
| 238 |
+
max_steps=-1,
|
| 239 |
+
metric=None,
|
| 240 |
+
metric_for_best_model=loss,
|
| 241 |
+
model=Qwen/Qwen2.5-7B-Instruct,
|
| 242 |
+
model_author=None,
|
| 243 |
+
model_kwargs={},
|
| 244 |
+
model_name=None,
|
| 245 |
+
model_revision=None,
|
| 246 |
+
model_type=qwen2_5,
|
| 247 |
+
modules_to_save=[],
|
| 248 |
+
mp_parameters=,
|
| 249 |
+
neftune_noise_alpha=None,
|
| 250 |
+
new_special_tokens=[],
|
| 251 |
+
no_cuda=False,
|
| 252 |
+
norm_bbox=None,
|
| 253 |
+
num_beams=1,
|
| 254 |
+
num_labels=None,
|
| 255 |
+
num_train_epochs=2.0,
|
| 256 |
+
optim=adamw_torch_fused,
|
| 257 |
+
optim_args=None,
|
| 258 |
+
optim_target_modules=None,
|
| 259 |
+
optimizer=None,
|
| 260 |
+
output_dir=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655,
|
| 261 |
+
overwrite_output_dir=False,
|
| 262 |
+
packing=False,
|
| 263 |
+
packing_length=None,
|
| 264 |
+
padding_free=False,
|
| 265 |
+
padding_side=right,
|
| 266 |
+
past_index=-1,
|
| 267 |
+
per_device_eval_batch_size=1,
|
| 268 |
+
per_device_train_batch_size=2,
|
| 269 |
+
predict_with_generate=False,
|
| 270 |
+
prediction_loss_only=False,
|
| 271 |
+
problem_type=None,
|
| 272 |
+
push_to_hub=False,
|
| 273 |
+
push_to_hub_model_id=None,
|
| 274 |
+
push_to_hub_organization=None,
|
| 275 |
+
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
|
| 276 |
+
quant_bits=None,
|
| 277 |
+
quant_method=None,
|
| 278 |
+
ray_scope=last,
|
| 279 |
+
reft_args=None,
|
| 280 |
+
reft_intervention_type=LoreftIntervention,
|
| 281 |
+
reft_layer_key=None,
|
| 282 |
+
reft_layers=None,
|
| 283 |
+
reft_rank=4,
|
| 284 |
+
remove_unused_columns=True,
|
| 285 |
+
repetition_penalty=None,
|
| 286 |
+
report_to=['tensorboard'],
|
| 287 |
+
response_prefix=None,
|
| 288 |
+
restore_callback_states_from_checkpoint=False,
|
| 289 |
+
resume_from_checkpoint=None,
|
| 290 |
+
resume_only_model=False,
|
| 291 |
+
rope_scaling=None,
|
| 292 |
+
router_aux_loss_coef=0.0,
|
| 293 |
+
run_name=/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655,
|
| 294 |
+
save_on_each_node=False,
|
| 295 |
+
save_only_model=False,
|
| 296 |
+
save_safetensors=True,
|
| 297 |
+
save_steps=500,
|
| 298 |
+
save_strategy=epoch,
|
| 299 |
+
save_total_limit=None,
|
| 300 |
+
seed=42,
|
| 301 |
+
sequence_parallel_size=1,
|
| 302 |
+
shuffle_buffer_size=1000,
|
| 303 |
+
skip_memory_metrics=True,
|
| 304 |
+
sortish_sampler=False,
|
| 305 |
+
split_dataset_ratio=0.001,
|
| 306 |
+
stop_words=[],
|
| 307 |
+
stopping_strategy=first_exhausted,
|
| 308 |
+
stream=False,
|
| 309 |
+
streaming=False,
|
| 310 |
+
strict=False,
|
| 311 |
+
swanlab_exp_name=None,
|
| 312 |
+
swanlab_lark_secret=None,
|
| 313 |
+
swanlab_lark_webhook_url=None,
|
| 314 |
+
swanlab_mode=cloud,
|
| 315 |
+
swanlab_project=None,
|
| 316 |
+
swanlab_token=<SWANLAB_TOKEN>,
|
| 317 |
+
swanlab_workspace=None,
|
| 318 |
+
system=None,
|
| 319 |
+
target_modules=['all-linear'],
|
| 320 |
+
target_regex=None,
|
| 321 |
+
task_type=causal_lm,
|
| 322 |
+
temperature=0.0,
|
| 323 |
+
template=qwen2_5,
|
| 324 |
+
template_backend=swift,
|
| 325 |
+
tf32=None,
|
| 326 |
+
top_k=None,
|
| 327 |
+
top_logprobs=None,
|
| 328 |
+
top_p=None,
|
| 329 |
+
torch_compile=False,
|
| 330 |
+
torch_compile_backend=None,
|
| 331 |
+
torch_compile_mode=None,
|
| 332 |
+
torch_dtype=torch.bfloat16,
|
| 333 |
+
torch_empty_cache_steps=None,
|
| 334 |
+
torchdynamo=None,
|
| 335 |
+
tpu_metrics_debug=False,
|
| 336 |
+
tpu_num_cores=None,
|
| 337 |
+
train_dataloader_shuffle=True,
|
| 338 |
+
train_type=full,
|
| 339 |
+
trainable_parameters=[],
|
| 340 |
+
trainable_parameters_regex=None,
|
| 341 |
+
truncation_strategy=delete,
|
| 342 |
+
tuner_backend=peft,
|
| 343 |
+
use_chat_template=True,
|
| 344 |
+
use_cpu=False,
|
| 345 |
+
use_dora=False,
|
| 346 |
+
use_flash_ckpt=False,
|
| 347 |
+
use_galore=False,
|
| 348 |
+
use_hf=False,
|
| 349 |
+
use_ipex=False,
|
| 350 |
+
use_legacy_prediction_loop=False,
|
| 351 |
+
use_liger_kernel=False,
|
| 352 |
+
use_logits_to_keep=None,
|
| 353 |
+
use_mps_device=False,
|
| 354 |
+
use_rslora=False,
|
| 355 |
+
use_swift_lora=False,
|
| 356 |
+
val_dataset=[],
|
| 357 |
+
val_dataset_shuffle=False,
|
| 358 |
+
vera_d_initial=0.1,
|
| 359 |
+
vera_dropout=0.0,
|
| 360 |
+
vera_projection_prng_key=0,
|
| 361 |
+
vera_rank=256,
|
| 362 |
+
vit_gradient_checkpointing=None,
|
| 363 |
+
vit_lr=None,
|
| 364 |
+
warmup_ratio=0.05,
|
| 365 |
+
warmup_steps=0,
|
| 366 |
+
weight_decay=0.1,
|
| 367 |
+
zero_hpz_partition_size=None,
|
| 368 |
+
)
|
| 369 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 370 |
+
[INFO:swift] Downloading the model from ModelScope Hub, model_id: Qwen/Qwen2.5-7B-Instruct
|
| 371 |
+
[2025-09-17 13:46:58,379] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 372 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 373 |
+
[INFO:modelscope] Target directory already exists, skipping creation.
|
| 374 |
+
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct
|
| 375 |
+
[INFO:swift] model_kwargs: {'device_map': None}
|
| 376 |
+
[2025-09-17 13:46:59,975] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 377 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 378 |
+
[2025-09-17 13:47:01,668] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 379 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 380 |
+
[2025-09-17 13:47:03,273] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 381 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 382 |
+
[2025-09-17 13:47:04,917] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 383 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 384 |
+
[2025-09-17 13:47:06,546] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 385 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 386 |
+
[2025-09-17 13:47:08,173] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 387 |
+
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-7B-Instruct
|
| 388 |
+
[2025-09-17 13:47:09,832] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 8
|
| 389 |
+
[2025-09-17 13:47:09,976] [INFO] [partition_parameters.py:366:__exit__] finished initializing model - num_params = 339, num_elems = 7.62B
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
[INFO:swift] model_info: ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=Qwen2Config {
|
| 399 |
+
"architectures": [
|
| 400 |
+
"Qwen2ForCausalLM"
|
| 401 |
+
],
|
| 402 |
+
"attention_dropout": 0.0,
|
| 403 |
+
"bos_token_id": 151643,
|
| 404 |
+
"eos_token_id": 151645,
|
| 405 |
+
"hidden_act": "silu",
|
| 406 |
+
"hidden_size": 3584,
|
| 407 |
+
"initializer_range": 0.02,
|
| 408 |
+
"intermediate_size": 18944,
|
| 409 |
+
"layer_types": [
|
| 410 |
+
"full_attention",
|
| 411 |
+
"full_attention",
|
| 412 |
+
"full_attention",
|
| 413 |
+
"full_attention",
|
| 414 |
+
"full_attention",
|
| 415 |
+
"full_attention",
|
| 416 |
+
"full_attention",
|
| 417 |
+
"full_attention",
|
| 418 |
+
"full_attention",
|
| 419 |
+
"full_attention",
|
| 420 |
+
"full_attention",
|
| 421 |
+
"full_attention",
|
| 422 |
+
"full_attention",
|
| 423 |
+
"full_attention",
|
| 424 |
+
"full_attention",
|
| 425 |
+
"full_attention",
|
| 426 |
+
"full_attention",
|
| 427 |
+
"full_attention",
|
| 428 |
+
"full_attention",
|
| 429 |
+
"full_attention",
|
| 430 |
+
"full_attention",
|
| 431 |
+
"full_attention",
|
| 432 |
+
"full_attention",
|
| 433 |
+
"full_attention",
|
| 434 |
+
"full_attention",
|
| 435 |
+
"full_attention",
|
| 436 |
+
"full_attention",
|
| 437 |
+
"full_attention"
|
| 438 |
+
],
|
| 439 |
+
"max_position_embeddings": 32768,
|
| 440 |
+
"max_window_layers": 28,
|
| 441 |
+
"model_type": "qwen2",
|
| 442 |
+
"num_attention_heads": 28,
|
| 443 |
+
"num_hidden_layers": 28,
|
| 444 |
+
"num_key_value_heads": 4,
|
| 445 |
+
"pad_token_id": 151643,
|
| 446 |
+
"rms_norm_eps": 1e-06,
|
| 447 |
+
"rope_scaling": null,
|
| 448 |
+
"rope_theta": 1000000.0,
|
| 449 |
+
"sliding_window": null,
|
| 450 |
+
"tie_word_embeddings": false,
|
| 451 |
+
"torch_dtype": "bfloat16",
|
| 452 |
+
"transformers_version": "4.55.4",
|
| 453 |
+
"use_cache": true,
|
| 454 |
+
"use_sliding_window": false,
|
| 455 |
+
"vocab_size": 152064
|
| 456 |
+
}
|
| 457 |
+
, task_type='causal_lm', num_labels=None)
|
| 458 |
+
[INFO:swift] model.generation_config: GenerationConfig {
|
| 459 |
+
"bos_token_id": 151643,
|
| 460 |
+
"eos_token_id": [
|
| 461 |
+
151645,
|
| 462 |
+
151643
|
| 463 |
+
],
|
| 464 |
+
"max_new_tokens": 64,
|
| 465 |
+
"pad_token_id": 151643,
|
| 466 |
+
"repetition_penalty": 1.05
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
[INFO:swift] default_system: 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.'
|
| 470 |
+
[INFO:swift] max_length: 16240
|
| 471 |
+
[INFO:swift] response_prefix: ''
|
| 472 |
+
[INFO:swift] agent_template: hermes
|
| 473 |
+
[INFO:swift] Start time of running main: 2025-09-17 13:47:12.047609
|
| 474 |
+
[INFO:swift] swift.__version__: 3.8.0.dev0
|
| 475 |
+
Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
Setting num_proc from 100 back to 1 for the train split to disable multiprocessing as it only contains one shard.
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
[INFO:swift] train_dataset: Dataset({
|
| 482 |
+
features: ['messages'],
|
| 483 |
+
num_rows: 23973
|
| 484 |
+
})
|
| 485 |
+
[INFO:swift] val_dataset: Dataset({
|
| 486 |
+
features: ['messages'],
|
| 487 |
+
num_rows: 21
|
| 488 |
+
})
|
| 489 |
+
[INFO:swift] The split dataset from the training set will be saved at: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/val_dataset.jsonl.
|
| 490 |
+
|
| 491 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 492 |
+
|
| 493 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 494 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 495 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 496 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 497 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 498 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 499 |
+
num_proc must be <= 21. Reducing num_proc to 21 for dataset of size 21.
|
| 500 |
+
[INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 264, 6929, 16230, 17847, 6188, 311, 9026, 3019, 14319, 29208, 6929, 7525, 29720, 323, 23638, 311, 4583, 279, 1196, 594, 3383, 13, 1446, 525, 3897, 448, 3151, 9079, 323, 44610, 13904, 1995, 11, 323, 498, 1184, 311, 2550, 13382, 6168, 311, 22054, 279, 1196, 594, 3383, 382, 8420, 594, 279, 1995, 498, 3278, 614, 510, 785, 1196, 594, 16538, 25, 1096, 374, 279, 3383, 498, 2299, 4460, 311, 4583, 624, 785, 1482, 3482, 2150, 594, 39700, 4916, 25, 1096, 374, 264, 43799, 13042, 315, 279, 44610, 11, 8241, 1376, 1995, 624, 785, 1787, 22398, 25, 4220, 525, 279, 22398, 498, 614, 1787, 624, 785, 3681, 6168, 25, 2619, 525, 279, 6168, 498, 1101, 10660, 13, 1084, 1231, 387, 10950, 311, 3754, 697, 5098, 382, 785, 6168, 498, 646, 2736, 4399, 1119, 3807, 11059, 1447, 2665, 16730, 26722, 510, 63, 3678, 508, 307, 60, 508, 1796, 60, 44622, 1096, 1917, 27749, 389, 458, 2392, 448, 264, 3151, 877, 389, 279, 44610, 624, 63, 1313, 508, 307, 60, 508, 1796, 60, 508, 1873, 37480, 19844, 28, 15, 91, 16, 60, 44622, 5443, 419, 311, 943, 279, 2213, 1119, 279, 2070, 448, 877, 13, 3216, 1638, 11, 279, 1591, 6269, 3014, 1376, 374, 17320, 1283, 19496, 7241, 3493, 37480, 19844, 374, 738, 311, 220, 15, 624, 63, 17583, 508, 307, 60, 508, 1796, 60, 44622, 85569, 916, 458, 2392, 448, 877, 624, 63, 1873, 508, 792, 34454, 60, 44622, 220, 4467, 23156, 279, 25352, 315, 264, 1376, 10601, 389, 279, 13625, 320, 68, 1302, 2572, 37014, 98267, 4292, 63, 12605, 508, 2923, 91, 454, 60, 44622, 22392, 279, 2150, 705, 476, 1495, 382, 8582, 9551, 26722, 510, 63, 931, 17344, 44622, 5264, 264, 501, 11, 4287, 6929, 5651, 624, 63, 6192, 47492, 508, 6192, 3560, 60, 44622, 15586, 279, 6929, 594, 5244, 311, 264, 3151, 5651, 1667, 1181, 1922, 624, 63, 5552, 17344, 44622, 13032, 279, 5023, 4541, 5651, 382, 3144, 17980, 26722, 510, 63, 28535, 508, 1085, 60, 44622, 81739, 311, 264, 3151, 5548, 624, 63, 3346, 3895, 44622, 81739, 311, 279, 8597, 19334, 2150, 624, 63, 3346, 32121, 44622, 81739, 311, 279, 1790, 2150, 320, 333, 264, 3681, 364, 3346, 3895, 6, 1917, 572, 10660, 3593, 33190, 5586, 510, 63, 9495, 508, 9217, 60, 44622, 25226, 419, 1917, 979, 498, 4411, 279, 3383, 374, 4583, 13, 1416, 279, 16538, 374, 311, 1477, 264, 1467, 5980, 4226, 11, 3410, 279, 4226, 304, 279, 31642, 13, 1416, 498, 4411, 279, 3383, 374, 11997, 311, 4583, 11, 3410, 279, 4226, 438, 1591, 45, 10360, 3014, 304, 279, 31642, 382, 1249, 387, 6849, 11, 432, 374, 1602, 2989, 311, 1795, 279, 2701, 5601, 510, 16, 13, 1446, 1265, 1172, 4265, 458, 1917, 429, 374, 2697, 2661, 279, 1482, 21930, 624, 17, 13, 1446, 1265, 1172, 4265, 825, 1917, 518, 264, 882, 624, 18, 13, 1446, 1265, 1795, 279, 10295, 311, 2874, 3019, 553, 3019, 323, 1221, 4265, 279, 1790, 1917, 624, 19, 13, 1446, 1265, 8300, 311, 13656, 6168, 979, 4265, 458, 1917, 323, 1430, 537, 311, 1281, 58077, 6168, 198, 20, 13, 2009, 32711, 1969, 387, 4766, 30586, 26865, 1472, 26865, 43626, 9492, 11, 323, 1052, 1969, 387, 902, 2550, 1573, 30586, 26865, 1472, 26865, 29, 18639, 21, 13, 4636, 30586, 26865, 1472, 26865, 29, 7808, 1172, 279, 1917, 1265, 387, 7907, 304, 279, 4396, 3561, 11, 43810, 304, 2038, 69155, 13, 1752, 3110, 510, 256, 366, 26865, 41993, 3137, 5868, 9760, 311, 847, 5795, 13, 9189, 287, 432, 1265, 1896, 752, 311, 279, 1790, 3019, 3918, 26865, 397, 256, 54275, 3678, 508, 307, 60, 508, 1796, 60, 13874, 3989, 22, 13, 25226, 279, 2936, 1917, 979, 498, 1744, 498, 614, 16994, 279, 16538, 13, 4320, 1405, 6923, 4113, 1283, 2936, 624, 23, 13, 23240, 3561, 6168, 12440, 25, 715, 73594, 5631, 508, 13786, 60, 13874, 3989, 2461, 3110, 11, 421, 15039, 369, 1591, 61907, 2802, 41612, 304, 279, 2274, 3014, 304, 264, 2711, 2070, 448, 3034, 1565, 17, 16, 7808, 12440, 3561, 432, 438, 510, 73594, 1313, 508, 17, 16, 60, 508, 61907, 2802, 41612, 304, 279, 2274, 60, 508, 16, 60, 13874, 3989, 52116, 15114, 19856, 429, 51044, 38929, 2163, 5029, 476, 24064, 2750, 624, 151645, 198, 151644, 872, 271, 78306, 25, 879, 702, 279, 1429, 3842, 2256, 19212, 24544, 198, 37763, 367, 25, 220, 508, 16, 20, 21, 24, 60, 18854, 5981, 8726, 364, 19284, 20288, 10058, 6, 10735, 25, 3007, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 198, 197, 58, 17, 23, 17, 24, 60, 73999, 330, 5890, 364, 54, 14939, 14913, 2567, 25, 3557, 198, 197, 58, 17, 23, 18, 16, 60, 31300, 3355, 10067, 25, 895, 198, 197, 58, 17, 23, 18, 23, 60, 2656, 364, 10850, 311, 10565, 2150, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 5894, 197, 197, 58, 17, 23, 18, 24, 60, 3137, 364, 145574, 1248, 197, 58, 17, 23, 19, 15, 60, 2656, 330, 10850, 311, 279, 1887, 2150, 315, 364, 54, 14939, 14913, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 5894, 197, 197, 58, 17, 23, 19, 16, 60, 3137, 364, 54, 14939, 1248, 197, 58, 17, 23, 19, 17, 60, 2656, 364, 10850, 311, 264, 26618, 4091, 2150, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 79442, 30, 1796, 63417, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 198, 197, 197, 58, 17, 23, 19, 18, 60, 3137, 364, 147724, 1248, 197, 58, 17, 23, 20, 15, 60, 14496, 364, 19284, 20288, 10058, 6, 702, 24381, 25, 5022, 198, 197, 58, 16, 21, 21, 24, 60, 23105, 1178, 364, 785, 7297, 20288, 10058, 374, 458, 10084, 304, 279, 7885, 197, 58, 17, 23, 20, 20, 60, 2656, 364, 30812, 20761, 8953, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 20290, 1663, 1400, 1905, 3959, 62, 70107, 198, 197, 58, 17, 23, 20, 21, 60, 2656, 364, 19284, 20288, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 198, 197, 58, 16, 21, 22, 20, 60, 23105, 1178, 6256, 8704, 1172, 825, 7885, 197, 58, 17, 23, 20, 22, 60, 2656, 364, 53, 1701, 62818, 36389, 55104, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 27233, 1701, 2351, 2855, 36389, 1139, 57075, 198, 197, 58, 16, 21, 22, 22, 60, 23105, 1178, 364, 374, 21328, 311, 279, 2083, 320, 78147, 8, 5086, 11, 279, 7297, 20288, 10058, 6081, 264, 6530, 480, 296, 75284, 369, 279, 5042, 4217, 323, 2083, 3613, 311, 2506, 369, 5577, 311, 7735, 551, 862, 12560, 23421, 197, 58, 17, 23, 20, 24, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 16, 198, 197, 58, 16, 21, 22, 23, 60, 23105, 1178, 364, 2619, 525, 1083, 24544, 3897, 311, 279, 38280, 5239, 2083, 315, 279, 7297, 20288, 23421, 197, 58, 17, 23, 21, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 17, 198, 197, 58, 17, 22, 24, 16, 60, 2168, 3355, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 38151, 11374, 3466, 1668, 9605, 6859, 43, 34683, 4819, 6411, 79, 198, 197, 58, 16, 21, 22, 24, 60, 23105, 1178, 364, 785, 50455, 7885, 197, 58, 17, 23, 21, 23, 60, 2656, 364, 19284, 20288, 29881, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 6859, 43, 198, 197, 58, 16, 21, 23, 16, 60, 23105, 1178, 364, 10058, 1248, 197, 58, 17, 22, 24, 17, 60, 2168, 3355, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 38151, 14, 22493, 18522, 2142, 1020, 81, 819, 4819, 6411, 79, 198, 197, 58, 17, 23, 22, 17, 60, 2656, 364, 40344, 576, 2142, 1020, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 31701, 4644, 62, 785, 2142, 1020, 198, 197, 58, 17, 23, 22, 18, 60, 2656, 364, 19284, 20288, 52594, 5543, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 6859, 53, 5543, 198, 197, 58, 16, 21, 23, 20, 60, 23105, 1178, 364, 18621, 10058, 320, 1291, 39567, 197, 58, 16, 21, 23, 21, 60, 23105, 1178, 364, 49, 819, 525, 1083, 21328, 311, 3613, 315, 279, 2083, 879, 14816, 279, 63536, 476, 47205, 21553, 23421, 197, 58, 17, 23, 22, 21, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 18, 198, 197, 58, 17, 23, 22, 24, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 19, 198, 197, 58, 17, 23, 23, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 20, 198, 197, 58, 16, 21, 23, 22, 60, 23105, 1178, 364, 576, 12588, 1083, 5707, 55371, 2291, 311, 678, 4217, 438, 1293, 438, 807, 614, 7391, 518, 3245, 2326, 3868, 389, 862, 2083, 748, 4541, 476, 31799, 1140, 26, 279, 19380, 12037, 3220, 374, 76171, 20030, 2878, 264, 2083, 4221, 42706, 11, 59666, 11, 323, 15532, 4217, 23421, 197, 58, 17, 23, 23, 20, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 21, 198, 197, 58, 17, 23, 23, 22, 60, 1874, 11776, 197, 197, 58, 17, 23, 24, 17, 60, 77254, 51942, 364, 7799, 6, 17183, 25, 3007, 198, 298, 197, 58, 17, 23, 24, 18, 60, 14496, 364, 7799, 1248, 197, 197, 58, 16, 21, 24, 15, 60, 23105, 1178, 364, 9485, 24544, 525, 11136, 1865, 315, 7885, 197, 197, 58, 17, 23, 24, 20, 60, 2656, 364, 27869, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 15792, 813, 198, 197, 197, 58, 17, 23, 24, 21, 60, 2656, 364, 24847, 6623, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 19382, 960, 54696, 198, 197, 197, 58, 16, 21, 24, 19, 60, 23105, 1178, 364, 448, 7885, 197, 197, 58, 17, 23, 24, 22, 60, 2656, 364, 88576, 82, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 14953, 18479, 82, 198, 197, 197, 58, 16, 21, 24, 21, 60, 23105, 1178, 6256, 2379, 5990, 2924, 279, 2083, 829, 11, 2083, 12426, 11, 279, 7885, 197, 197, 58, 17, 23, 24, 23, 60, 2656, 364, 27710, 330, 10134, 28808, 20584, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 10270, 3035, 33609, 15774, 643, 3394, 11751, 13488, 1243, 16068, 62, 22372, 62, 23256, 8378, 920, 276, 2584, 2, 10253, 3575, 16068, 66696, 18695, 17, 17, 14615, 4138, 9132, 4, 17, 17, 198, 197, 197, 58, 16, 21, 24, 23, 60, 23105, 1178, 6614, 323, 279, 7297, 20288, 1372, 320, 42966, 16317, 304, 7885, 197, 197, 58, 17, 23, 24, 24, 60, 2656, 364, 60980, 7857, 1127, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 19382, 6908, 4273, 3253, 198, 197, 197, 58, 16, 22, 15, 15, 60, 23105, 1178, 49884, 8999, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 7885, 197, 197, 58, 17, 24, 15, 15, 60, 2656, 364, 53, 1701, 62818, 36389, 55104, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 27233, 1701, 2351, 2855, 36389, 1139, 57075, 198, 197, 197, 58, 16, 22, 15, 17, 60, 23105, 1178, 364, 476, 264, 7885, 197, 197, 58, 17, 22, 23, 20, 60, 2656, 364, 84336, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 10360, 15717, 75759, 3959, 55808, 198, 197, 197, 58, 16, 22, 15, 19, 60, 23105, 1178, 6614, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 23421, 197, 197, 58, 17, 24, 15, 17, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 22, 198, 197, 197, 58, 17, 24, 15, 20, 60, 2656, 364, 1294, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 11374, 3466, 1668, 9605, 34683, 2, 67075, 27207, 12, 23, 198, 197, 197, 58, 16, 22, 15, 21, 60, 23105, 1178, 364, 7496, 24544, 525, 27548, 553, 16035, 93350, 2813, 7885, 197, 197, 58, 17, 24, 15, 22, 60, 2656, 364, 41, 535, 724, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 31701, 535, 724, 198, 197, 197, 58, 16, 22, 15, 23, 60, 23105, 1178, 6256, 576, 24544, 3545, 525, 14279, 10449, 304, 458, 25777, 3745, 323, 5144, 3037, 1142, 23421, 197, 197, 58, 16, 22, 15, 24, 60, 23105, 1178, 364, 785, 10987, 2083, 646, 11136, 3042, 894, 1372, 315, 24544, 311, 420, 635, 423, 807, 5157, 11, 2670, 5990, 11, 714, 537, 7199, 311, 25, 4217, 320, 3028, 21445, 11, 31799, 21445, 11, 476, 7885, 197, 197, 58, 17, 24, 15, 24, 60, 2656, 364, 258, 79391, 20499, 6, 2515, 25, 3703, 1110, 83, 7272, 2143, 24138, 25502, 6324, 14939, 6205, 5705, 6345, 72, 62, 17, 15, 17, 17, 12, 15, 20, 10360, 14, 641, 79391, 89591, 2019, 198, 197, 197, 58, 16, 22, 16, 16, 60, 23105, 1178, 89384, 23956, 11, 50091, 11, 28409, 11, 16849, 11, 323, 4586, 5570, 13, 4329, 7263, 614, 2661, 24544, 311, 4741, 4217, 323, 23956, 429, 1033, 389, 279, 2083, 518, 1045, 1459, 2337, 279, 3200, 11, 8818, 537, 3432, 1012, 389, 279, 10987, 21445, 369, 279, 7297, 20288, 5086, 23421, 39, 35586, 21866, 25, 5872, 1313, 508, 18, 18, 15, 60, 508, 19284, 20288, 24544, 60, 508, 16, 921, 3678, 508, 16, 17, 20, 17, 60, 508, 19284, 20288, 10058, 921, 151645, 198, 151644, 77091, 198, 13708, 766, 1784, 26865, 16357, 1482, 2150, 374, 911, 7297, 20288, 24544, 11, 714, 1558, 537, 3602, 3410, 279, 4226, 311, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 13, 2619, 374, 264, 3772, 44291, 330, 8441, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 55132, 62818, 36389, 55104, 476, 264, 8964, 11, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 1189, 1096, 30643, 518, 3842, 2781, 32897, 11, 438, 16087, 311, 2083, 58184, 13, 2014, 1477, 700, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 11, 358, 1265, 1401, 369, 264, 1140, 476, 6286, 315, 7775, 11, 10767, 553, 37976, 4623, 1495, 311, 1490, 421, 264, 1140, 476, 1965, 315, 3255, 2832, 32120, 7952, 3918, 26865, 397, 522, 26865, 397, 73594, 12605, 508, 2923, 60, 73594, 151645]
|
| 501 |
+
[INFO:swift] [INPUT] <|im_start|>system
|
| 502 |
+
You are a browser interaction assistant designed to execute step-by-step browser operations efficiently and precisely to complete the user's task. You are provided with specific tasks and webpage-related information, and you need to output accurate actions to accomplish the user's task.
|
| 503 |
+
|
| 504 |
+
Here's the information you'll have:
|
| 505 |
+
The user's objective: This is the task you're trying to complete.
|
| 506 |
+
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
|
| 507 |
+
The open tabs: These are the tabs you have open.
|
| 508 |
+
The previous actions: There are the actions you just performed. It may be helpful to track your progress.
|
| 509 |
+
|
| 510 |
+
The actions you can perform fall into several categories:
|
| 511 |
+
|
| 512 |
+
Page Operation Actions:
|
| 513 |
+
`click [id] [content]`: This action clicks on an element with a specific id on the webpage.
|
| 514 |
+
`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the ""Enter"" key is pressed after typing unless press_enter_after is set to 0.
|
| 515 |
+
`hover [id] [content]`: Hover over an element with id.
|
| 516 |
+
`press [key_comb]`: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
|
| 517 |
+
`scroll [down|up]`: Scroll the page up or down.
|
| 518 |
+
|
| 519 |
+
Tab Management Actions:
|
| 520 |
+
`new_tab`: Open a new, empty browser tab.
|
| 521 |
+
`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
|
| 522 |
+
`close_tab`: Close the currently active tab.
|
| 523 |
+
|
| 524 |
+
URL Navigation Actions:
|
| 525 |
+
`goto [url]`: Navigate to a specific URL.
|
| 526 |
+
`go_back`: Navigate to the previously viewed page.
|
| 527 |
+
`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
|
| 528 |
+
|
| 529 |
+
Completion Action:
|
| 530 |
+
`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as ""N/A"" in the bracket.
|
| 531 |
+
|
| 532 |
+
To be successful, it is very important to follow the following rules:
|
| 533 |
+
1. You should only issue an action that is valid given the current observation.
|
| 534 |
+
2. You should only issue one action at a time.
|
| 535 |
+
3. You should follow the examples to reason step by step and then issue the next action.
|
| 536 |
+
4. You should refer to historical actions when issue an action and try not to make repetitive actions
|
| 537 |
+
5. All reasoning must be inside `<think></think>` tags, and there must be no output before `<think></think>`.
|
| 538 |
+
6. After `<think></think>`, only the action should be generated in the correct format, enclosed in code fences. For example:
|
| 539 |
+
<think>This button looks relevant to my goal. Clicking it should take me to the next step.</think>
|
| 540 |
+
```click [id] [content]```
|
| 541 |
+
7. Issue the stop action when you think you have achieved the objective. Don’t generate anything after stop.
|
| 542 |
+
8. Always format actions correctly:
|
| 543 |
+
```command [parameters]```
|
| 544 |
+
For example, if searching for ""death row inmates in the US"" in a search field with ID `21`, correctly format it as:
|
| 545 |
+
```type [21] [death row inmates in the US] [1]```
|
| 546 |
+
Avoid incorrect formats that omit brackets around parameters or numeric values.
|
| 547 |
+
<|im_end|>
|
| 548 |
+
<|im_start|>user
|
| 549 |
+
|
| 550 |
+
Objective: who has the most individual super bowl rings
|
| 551 |
+
Observation: [1569] RootWebArea 'Super Bowl ring' focused: True url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring
|
| 552 |
+
[2829] textbox "Search 'Wikipedia'" required: False
|
| 553 |
+
[2831] checkbox '' checked: false
|
| 554 |
+
[2838] link 'Go to welcome page' url: https://tigerai.ca/
|
| 555 |
+
[2839] button '🏠'
|
| 556 |
+
[2840] link "Go to the main page of 'Wikipedia'" url: https://tigerai.ca/wikipedia_en_all_maxi_2022-05/
|
| 557 |
+
[2841] button 'Wikipedia'
|
| 558 |
+
[2842] link 'Go to a randomly selected page' url: https://tigerai.ca/random?content=wikipedia_en_all_maxi_2022-05
|
| 559 |
+
[2843] button '🎲'
|
| 560 |
+
[2850] heading 'Super Bowl ring' hasPopup: menu
|
| 561 |
+
[1669] StaticText 'The Super Bowl ring is an award in the '
|
| 562 |
+
[2855] link 'National Football League' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/National_Football_League
|
| 563 |
+
[2856] link 'Super Bowl' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl
|
| 564 |
+
[1675] StaticText '. Since only one '
|
| 565 |
+
[2857] link 'Vince Lombardi Trophy' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Vince_Lombardi_Trophy
|
| 566 |
+
[1677] StaticText ' is awarded to the team (ownership) itself, the Super Bowl ring offers a collectable memento for the actual players and team members to keep for themselves to symbolize their victory.'
|
| 567 |
+
[2859] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-1
|
| 568 |
+
[1678] StaticText ' There are also rings provided to the runners-up team of the Super Bowl.'
|
| 569 |
+
[2862] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-2
|
| 570 |
+
[2791] image '' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/I/Super_Bowl_XL_ring.jpg.webp
|
| 571 |
+
[1679] StaticText 'The Steelers '
|
| 572 |
+
[2868] link 'Super Bowl XL' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_XL
|
| 573 |
+
[1681] StaticText ' ring'
|
| 574 |
+
[2792] image '' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/I/Joetheismannrings.jpg.webp
|
| 575 |
+
[2872] link 'Joe Theismann' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Joe_Theismann
|
| 576 |
+
[2873] link 'Super Bowl XVII' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_XVII
|
| 577 |
+
[1685] StaticText ' Championship ring (right)'
|
| 578 |
+
[1686] StaticText 'Rings are also awarded to members of the team who wins the AFC or NFC championship.'
|
| 579 |
+
[2876] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-3
|
| 580 |
+
[2879] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-4
|
| 581 |
+
[2882] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-5
|
| 582 |
+
[1687] StaticText ' The NFL also provides postseason pay to all players as long as they have spent at least three games on their team’s active or inactive list; the playoff bonus money is egalitarian within a team among starters, backups, and injured players.'
|
| 583 |
+
[2885] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-6
|
| 584 |
+
[2887] group ''
|
| 585 |
+
[2892] DisclosureTriangle 'Details' expanded: True
|
| 586 |
+
[2893] heading 'Details'
|
| 587 |
+
[1690] StaticText 'These rings are typically made of '
|
| 588 |
+
[2895] link 'yellow' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Gold
|
| 589 |
+
[2896] link 'rose gold' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Rose_gold
|
| 590 |
+
[1694] StaticText ' with '
|
| 591 |
+
[2897] link 'diamonds' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Diamonds
|
| 592 |
+
[1696] StaticText '. They usually include the team name, team logo, the '
|
| 593 |
+
[2898] link 'phrase "World Champions"' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Major_professional_sports_leagues_in_the_United_States_and_Canada#Use_of_the_phrase_%22world_champions%22
|
| 594 |
+
[1698] StaticText ', and the Super Bowl number (usually indicated in '
|
| 595 |
+
[2899] link 'Roman numerals' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Roman_numeral
|
| 596 |
+
[1700] StaticText '). Many rings feature diamonds in the shape of the '
|
| 597 |
+
[2900] link 'Vince Lombardi Trophy' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Vince_Lombardi_Trophy
|
| 598 |
+
[1702] StaticText ' or a '
|
| 599 |
+
[2785] link 'football' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/American_football_ball
|
| 600 |
+
[1704] StaticText ', to illustrate the number of Super Bowls that the franchise has won.'
|
| 601 |
+
[2902] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-7
|
| 602 |
+
[2905] link '[]' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Super_Bowl_ring#cite_note-8
|
| 603 |
+
[1706] StaticText ' Most rings are manufactured by memorabilia company '
|
| 604 |
+
[2907] link 'Jostens' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Jostens
|
| 605 |
+
[1708] StaticText '. The rings often are normally presented in an elegant box and/or display case.'
|
| 606 |
+
[1709] StaticText 'The winning team can typically present any number of rings to whomever they choose, including usually, but not limited to: players (active roster, inactive roster, or '
|
| 607 |
+
[2909] link 'injured reserve' url: https://tigerai.ca/wiki/wikipedia_en_all_maxi_2022-05/A/Injured_reserve_list
|
| 608 |
+
[1711] StaticText '), coaches, trainers, executives, personnel, and general staff. Some teams have given rings to former players and coaches that were on the team at some point during the season, despite not having been on the winning roster for the Super Bowl itself.'
|
| 609 |
+
HISTORY_ACTION:
|
| 610 |
+
type [330] [Super Bowl rings] [1]
|
| 611 |
+
click [1252] [Super Bowl ring]
|
| 612 |
+
<|im_end|>
|
| 613 |
+
<|im_start|>assistant
|
| 614 |
+
<think><think>The current page is about Super Bowl rings, but does not yet provide the answer to who has the most individual Super Bowl rings. There is a section mentioning "Many rings feature diamonds in the shape of the Vince Lombardi Trophy or a football, to illustrate the number of Super Bowls that the franchise has won." This hints at individual player achievements, as opposed to team championships. To find out who has the most individual Super Bowl rings, I should look for a list or mention of individuals, possibly by scrolling further down to see if a list or table of record-holders appears.</think>
|
| 615 |
+
</think>
|
| 616 |
+
```scroll [down]```<|im_end|>
|
| 617 |
+
[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 1784, 26865, 16357, 1482, 2150, 374, 911, 7297, 20288, 24544, 11, 714, 1558, 537, 3602, 3410, 279, 4226, 311, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 13, 2619, 374, 264, 3772, 44291, 330, 8441, 24544, 4565, 48051, 304, 279, 6083, 315, 279, 55132, 62818, 36389, 55104, 476, 264, 8964, 11, 311, 40368, 279, 1372, 315, 7297, 18436, 4730, 429, 279, 19024, 702, 2765, 1189, 1096, 30643, 518, 3842, 2781, 32897, 11, 438, 16087, 311, 2083, 58184, 13, 2014, 1477, 700, 879, 702, 279, 1429, 3842, 7297, 20288, 24544, 11, 358, 1265, 1401, 369, 264, 1140, 476, 6286, 315, 7775, 11, 10767, 553, 37976, 4623, 1495, 311, 1490, 421, 264, 1140, 476, 1965, 315, 3255, 2832, 32120, 7952, 3918, 26865, 397, 522, 26865, 397, 73594, 12605, 508, 2923, 60, 73594, 151645]
|
| 618 |
+
[INFO:swift] [LABELS] [-100 * 2687]<think><think>The current page is about Super Bowl rings, but does not yet provide the answer to who has the most individual Super Bowl rings. There is a section mentioning "Many rings feature diamonds in the shape of the Vince Lombardi Trophy or a football, to illustrate the number of Super Bowls that the franchise has won." This hints at individual player achievements, as opposed to team championships. To find out who has the most individual Super Bowl rings, I should look for a list or mention of individuals, possibly by scrolling further down to see if a list or table of record-holders appears.</think>
|
| 619 |
+
</think>
|
| 620 |
+
```scroll [down]```<|im_end|>
|
| 621 |
+
[INFO:swift] Dataset Token Length: 2803.201644±911.025599, min=828.000000, max=13246.000000, size=23973
|
| 622 |
+
[INFO:swift] Dataset Token Length: 3108.476190±840.159544, min=1605.000000, max=4793.000000, size=21
|
| 623 |
+
[INFO:swift] The TrainArguments will be saved in: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/args.json
|
| 624 |
+
[INFO:swift] model: Qwen2ForCausalLM(
|
| 625 |
+
(model): Qwen2Model(
|
| 626 |
+
(embed_tokens): Embedding(152064, 3584)
|
| 627 |
+
(layers): ModuleList(
|
| 628 |
+
(0-27): 28 x Qwen2DecoderLayer(
|
| 629 |
+
(self_attn): Qwen2Attention(
|
| 630 |
+
(q_proj): Linear(in_features=3584, out_features=3584, bias=True)
|
| 631 |
+
(k_proj): Linear(in_features=3584, out_features=512, bias=True)
|
| 632 |
+
(v_proj): Linear(in_features=3584, out_features=512, bias=True)
|
| 633 |
+
(o_proj): Linear(in_features=3584, out_features=3584, bias=False)
|
| 634 |
+
)
|
| 635 |
+
(mlp): Qwen2MLP(
|
| 636 |
+
(gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
|
| 637 |
+
(up_proj): Linear(in_features=3584, out_features=18944, bias=False)
|
| 638 |
+
(down_proj): Linear(in_features=18944, out_features=3584, bias=False)
|
| 639 |
+
(act_fn): SiLU()
|
| 640 |
+
)
|
| 641 |
+
(input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
|
| 642 |
+
(post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
|
| 643 |
+
)
|
| 644 |
+
)
|
| 645 |
+
(norm): Qwen2RMSNorm((0,), eps=1e-06)
|
| 646 |
+
(rotary_emb): Qwen2RotaryEmbedding()
|
| 647 |
+
)
|
| 648 |
+
(lm_head): Linear(in_features=3584, out_features=152064, bias=False)
|
| 649 |
+
)
|
| 650 |
+
[INFO:swift] model_parameter_info: Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.
|
| 651 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 652 |
+
super().__init__(
|
| 653 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 654 |
+
super().__init__(
|
| 655 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 656 |
+
super().__init__(
|
| 657 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 658 |
+
super().__init__(
|
| 659 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 660 |
+
super().__init__(
|
| 661 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 662 |
+
super().__init__(
|
| 663 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 664 |
+
super().__init__(
|
| 665 |
+
/group/40143/hongzhuyi/ms-swift/swift/trainers/mixin.py:104: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
|
| 666 |
+
super().__init__(
|
| 667 |
+
Detected kernel version 5.4.241, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
| 668 |
+
[INFO:swift] use_reentrant: True
|
| 669 |
+
[INFO:swift] The logging file will be saved in: /group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/logging.jsonl
|
| 670 |
+
Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
|
| 671 |
+
Parameter Offload - Persistent parameters statistics: param_count = 141, numel = 333312
|
| 672 |
+
|
| 673 |
+
|
| 674 |
|
| 675 |
+
|
| 676 |
|
| 677 |
+
|
log/20250917-13:49:21.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v5-20250917-134655/args.json
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 2,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 4,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 5e-06,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 2.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B-Instruct",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl",
|
| 236 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl",
|
| 237 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl",
|
| 238 |
+
"/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl",
|
| 239 |
+
"/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl"
|
| 240 |
+
],
|
| 241 |
+
"val_dataset": [],
|
| 242 |
+
"split_dataset_ratio": 0.001,
|
| 243 |
+
"dataset_num_proc": 100,
|
| 244 |
+
"load_from_cache_file": true,
|
| 245 |
+
"dataset_shuffle": true,
|
| 246 |
+
"val_dataset_shuffle": false,
|
| 247 |
+
"streaming": false,
|
| 248 |
+
"interleave_prob": null,
|
| 249 |
+
"stopping_strategy": "first_exhausted",
|
| 250 |
+
"shuffle_buffer_size": 1000,
|
| 251 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 252 |
+
"columns": {},
|
| 253 |
+
"strict": false,
|
| 254 |
+
"model_name": null,
|
| 255 |
+
"model_author": null,
|
| 256 |
+
"custom_dataset_info": [],
|
| 257 |
+
"quant_method": null,
|
| 258 |
+
"quant_bits": null,
|
| 259 |
+
"hqq_axis": null,
|
| 260 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 261 |
+
"bnb_4bit_quant_type": "nf4",
|
| 262 |
+
"bnb_4bit_use_double_quant": true,
|
| 263 |
+
"bnb_4bit_quant_storage": null,
|
| 264 |
+
"max_new_tokens": 64,
|
| 265 |
+
"temperature": 0.0,
|
| 266 |
+
"top_k": null,
|
| 267 |
+
"top_p": null,
|
| 268 |
+
"repetition_penalty": null,
|
| 269 |
+
"num_beams": 1,
|
| 270 |
+
"stream": false,
|
| 271 |
+
"stop_words": [],
|
| 272 |
+
"logprobs": false,
|
| 273 |
+
"top_logprobs": null,
|
| 274 |
+
"ckpt_dir": null,
|
| 275 |
+
"lora_modules": [],
|
| 276 |
+
"train_type": "full",
|
| 277 |
+
"adapters": [],
|
| 278 |
+
"external_plugins": [],
|
| 279 |
+
"model_kwargs": {},
|
| 280 |
+
"load_args": false,
|
| 281 |
+
"load_data_args": false,
|
| 282 |
+
"packing": false,
|
| 283 |
+
"packing_length": null,
|
| 284 |
+
"lazy_tokenize": false,
|
| 285 |
+
"cached_dataset": [],
|
| 286 |
+
"custom_register_path": [],
|
| 287 |
+
"use_hf": false,
|
| 288 |
+
"ignore_args_error": false,
|
| 289 |
+
"use_swift_lora": false,
|
| 290 |
+
"freeze_parameters": [],
|
| 291 |
+
"freeze_parameters_regex": null,
|
| 292 |
+
"freeze_parameters_ratio": 0.0,
|
| 293 |
+
"trainable_parameters": [],
|
| 294 |
+
"trainable_parameters_regex": null,
|
| 295 |
+
"freeze_llm": false,
|
| 296 |
+
"freeze_vit": true,
|
| 297 |
+
"freeze_aligner": false,
|
| 298 |
+
"target_modules": [
|
| 299 |
+
"all-linear"
|
| 300 |
+
],
|
| 301 |
+
"target_regex": null,
|
| 302 |
+
"modules_to_save": [],
|
| 303 |
+
"lora_rank": 8,
|
| 304 |
+
"lora_alpha": 32,
|
| 305 |
+
"lora_dropout": 0.05,
|
| 306 |
+
"lora_bias": "none",
|
| 307 |
+
"lora_dtype": null,
|
| 308 |
+
"lorap_lr_ratio": null,
|
| 309 |
+
"use_rslora": false,
|
| 310 |
+
"use_dora": false,
|
| 311 |
+
"lora_ga_batch_size": 2,
|
| 312 |
+
"lora_ga_iters": 2,
|
| 313 |
+
"lora_ga_max_length": 1024,
|
| 314 |
+
"lora_ga_direction": "ArB2r",
|
| 315 |
+
"lora_ga_scale": "stable",
|
| 316 |
+
"lora_ga_stable_gamma": 16,
|
| 317 |
+
"init_weights": true,
|
| 318 |
+
"fourier_n_frequency": 2000,
|
| 319 |
+
"fourier_scaling": 300.0,
|
| 320 |
+
"boft_block_size": 4,
|
| 321 |
+
"boft_block_num": 0,
|
| 322 |
+
"boft_n_butterfly_factor": 1,
|
| 323 |
+
"boft_dropout": 0.0,
|
| 324 |
+
"vera_rank": 256,
|
| 325 |
+
"vera_projection_prng_key": 0,
|
| 326 |
+
"vera_dropout": 0.0,
|
| 327 |
+
"vera_d_initial": 0.1,
|
| 328 |
+
"adapter_act": "gelu",
|
| 329 |
+
"adapter_length": 128,
|
| 330 |
+
"use_galore": false,
|
| 331 |
+
"galore_target_modules": null,
|
| 332 |
+
"galore_rank": 128,
|
| 333 |
+
"galore_update_proj_gap": 50,
|
| 334 |
+
"galore_scale": 1.0,
|
| 335 |
+
"galore_proj_type": "std",
|
| 336 |
+
"galore_optim_per_parameter": false,
|
| 337 |
+
"galore_with_embedding": false,
|
| 338 |
+
"galore_quantization": false,
|
| 339 |
+
"galore_proj_quant": false,
|
| 340 |
+
"galore_proj_bits": 4,
|
| 341 |
+
"galore_proj_group_size": 256,
|
| 342 |
+
"galore_cos_threshold": 0.4,
|
| 343 |
+
"galore_gamma_proj": 2,
|
| 344 |
+
"galore_queue_size": 5,
|
| 345 |
+
"adalora_target_r": 8,
|
| 346 |
+
"adalora_init_r": 12,
|
| 347 |
+
"adalora_tinit": 0,
|
| 348 |
+
"adalora_tfinal": 0,
|
| 349 |
+
"adalora_deltaT": 1,
|
| 350 |
+
"adalora_beta1": 0.85,
|
| 351 |
+
"adalora_beta2": 0.85,
|
| 352 |
+
"adalora_orth_reg_weight": 0.5,
|
| 353 |
+
"llamapro_num_new_blocks": 4,
|
| 354 |
+
"llamapro_num_groups": null,
|
| 355 |
+
"lisa_activated_layers": 0,
|
| 356 |
+
"lisa_step_interval": 20,
|
| 357 |
+
"reft_layer_key": null,
|
| 358 |
+
"reft_layers": null,
|
| 359 |
+
"reft_rank": 4,
|
| 360 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 361 |
+
"reft_args": null,
|
| 362 |
+
"swanlab_token": null,
|
| 363 |
+
"swanlab_project": null,
|
| 364 |
+
"swanlab_workspace": null,
|
| 365 |
+
"swanlab_exp_name": null,
|
| 366 |
+
"swanlab_lark_webhook_url": null,
|
| 367 |
+
"swanlab_lark_secret": null,
|
| 368 |
+
"swanlab_mode": "cloud",
|
| 369 |
+
"add_version": true,
|
| 370 |
+
"create_checkpoint_symlink": false,
|
| 371 |
+
"zero_hpz_partition_size": null,
|
| 372 |
+
"deepspeed_autotp_size": null,
|
| 373 |
+
"early_stop_interval": null,
|
| 374 |
+
"rank": 0,
|
| 375 |
+
"global_world_size": 8,
|
| 376 |
+
"local_world_size": 8,
|
| 377 |
+
"model_suffix": "Qwen2.5-7B-Instruct",
|
| 378 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 379 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7fbf902b9ab0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 380 |
+
"model_dir": "/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
|
| 381 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 382 |
+
"evaluation_strategy": "epoch",
|
| 383 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v5-20250917-134655', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 384 |
+
}
|
v5-20250917-134655/images/train_epoch.png
ADDED
|
v5-20250917-134655/images/train_grad_norm.png
ADDED
|
v5-20250917-134655/images/train_learning_rate.png
ADDED
|
v5-20250917-134655/images/train_loss.png
ADDED
|
v5-20250917-134655/images/train_token_acc.png
ADDED
|
v5-20250917-134655/logging.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"loss": 1.27844775, "grad_norm": 25.99561637, "learning_rate": 1.3e-07, "token_acc": 0.73113072, "epoch": 0.00267023, "global_step/max_steps": "1/750", "percentage": "0.13%", "elapsed_time": "22s", "remaining_time": "4h 39m 20s", "memory(GiB)": 25.14, "train_speed(iter/s)": 0.044689}
|
| 2 |
+
{"loss": 1.27462864, "grad_norm": 25.99462207, "learning_rate": 2.6e-07, "token_acc": 0.72517288, "epoch": 0.00534045, "global_step/max_steps": "2/750", "percentage": "0.27%", "elapsed_time": "49s", "remaining_time": "5h 8m 18s", "memory(GiB)": 26.61, "train_speed(iter/s)": 0.040436}
|
| 3 |
+
{"train_dataset": "2803.201644±911.025599, min=828.000000, max=13246.000000, size=23973", "val_dataset": "3108.476190±840.159544, min=1605.000000, max=4793.000000, size=21", "model_parameter_info": "Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.", "last_model_checkpoint": null, "best_model_checkpoint": null, "best_metric": null, "global_step": 2, "log_history": [{"loss": 1.2784477472305298, "grad_norm": 25.995616372803973, "learning_rate": 1.3157894736842107e-07, "token_acc": 0.7311307191848755, "epoch": 0.0026702269692923898, "step": 1}, {"loss": 1.2746286392211914, "grad_norm": 25.994622070063752, "learning_rate": 2.6315789473684213e-07, "token_acc": 0.7251728773117065, "epoch": 0.0053404539385847796, "step": 2}], "memory": 26.607421875}
|
v5-20250917-134655/runs/events.out.tfevents.1758088071.TENCENT64.site.218247.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ae03fcdb066801b87d2feb5ea699f32778b8d57b4645456cf7f8b52e9c6bace
|
| 3 |
+
size 8344
|
v5-20250917-134655/val_dataset.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
v6-20250917-134949/args.json
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "epoch",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 2,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 4,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 5e-06,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 2.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 1,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "epoch",
|
| 38 |
+
"save_steps": 500,
|
| 39 |
+
"save_total_limit": null,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"use_ipex": false,
|
| 51 |
+
"bf16": true,
|
| 52 |
+
"fp16": false,
|
| 53 |
+
"fp16_opt_level": "O1",
|
| 54 |
+
"half_precision_backend": "auto",
|
| 55 |
+
"bf16_full_eval": false,
|
| 56 |
+
"fp16_full_eval": false,
|
| 57 |
+
"tf32": null,
|
| 58 |
+
"local_rank": 0,
|
| 59 |
+
"ddp_backend": null,
|
| 60 |
+
"tpu_num_cores": null,
|
| 61 |
+
"tpu_metrics_debug": false,
|
| 62 |
+
"debug": null,
|
| 63 |
+
"dataloader_drop_last": false,
|
| 64 |
+
"eval_steps": 2000.0,
|
| 65 |
+
"dataloader_num_workers": 48,
|
| 66 |
+
"dataloader_prefetch_factor": null,
|
| 67 |
+
"past_index": -1,
|
| 68 |
+
"run_name": "/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949",
|
| 69 |
+
"disable_tqdm": null,
|
| 70 |
+
"remove_unused_columns": true,
|
| 71 |
+
"label_names": null,
|
| 72 |
+
"load_best_model_at_end": false,
|
| 73 |
+
"metric_for_best_model": "loss",
|
| 74 |
+
"greater_is_better": false,
|
| 75 |
+
"ignore_data_skip": false,
|
| 76 |
+
"fsdp": "",
|
| 77 |
+
"fsdp_min_num_params": 0,
|
| 78 |
+
"fsdp_config": null,
|
| 79 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 80 |
+
"accelerator_config": {
|
| 81 |
+
"dispatch_batches": false
|
| 82 |
+
},
|
| 83 |
+
"deepspeed": {
|
| 84 |
+
"fp16": {
|
| 85 |
+
"enabled": "auto",
|
| 86 |
+
"loss_scale": 0,
|
| 87 |
+
"loss_scale_window": 1000,
|
| 88 |
+
"initial_scale_power": 16,
|
| 89 |
+
"hysteresis": 2,
|
| 90 |
+
"min_loss_scale": 1
|
| 91 |
+
},
|
| 92 |
+
"bf16": {
|
| 93 |
+
"enabled": "auto"
|
| 94 |
+
},
|
| 95 |
+
"zero_optimization": {
|
| 96 |
+
"stage": 3,
|
| 97 |
+
"offload_optimizer": {
|
| 98 |
+
"device": "none",
|
| 99 |
+
"pin_memory": true
|
| 100 |
+
},
|
| 101 |
+
"offload_param": {
|
| 102 |
+
"device": "none",
|
| 103 |
+
"pin_memory": true
|
| 104 |
+
},
|
| 105 |
+
"overlap_comm": false,
|
| 106 |
+
"contiguous_gradients": true,
|
| 107 |
+
"sub_group_size": 1000000000.0,
|
| 108 |
+
"reduce_bucket_size": "auto",
|
| 109 |
+
"zero_quantized_weights": false,
|
| 110 |
+
"zero_quantized_gradients": false,
|
| 111 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 112 |
+
"stage3_param_persistence_threshold": "auto",
|
| 113 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 114 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 115 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 116 |
+
},
|
| 117 |
+
"gradient_accumulation_steps": "auto",
|
| 118 |
+
"gradient_clipping": "auto",
|
| 119 |
+
"steps_per_print": 2000,
|
| 120 |
+
"train_batch_size": "auto",
|
| 121 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 122 |
+
"wall_clock_breakdown": false
|
| 123 |
+
},
|
| 124 |
+
"label_smoothing_factor": 0.0,
|
| 125 |
+
"optim": "adamw_torch_fused",
|
| 126 |
+
"optim_args": null,
|
| 127 |
+
"adafactor": false,
|
| 128 |
+
"group_by_length": false,
|
| 129 |
+
"length_column_name": "length",
|
| 130 |
+
"report_to": [
|
| 131 |
+
"tensorboard"
|
| 132 |
+
],
|
| 133 |
+
"ddp_find_unused_parameters": null,
|
| 134 |
+
"ddp_bucket_cap_mb": null,
|
| 135 |
+
"ddp_broadcast_buffers": null,
|
| 136 |
+
"dataloader_pin_memory": true,
|
| 137 |
+
"dataloader_persistent_workers": false,
|
| 138 |
+
"skip_memory_metrics": true,
|
| 139 |
+
"use_legacy_prediction_loop": false,
|
| 140 |
+
"push_to_hub": false,
|
| 141 |
+
"resume_from_checkpoint": null,
|
| 142 |
+
"hub_model_id": null,
|
| 143 |
+
"hub_strategy": "every_save",
|
| 144 |
+
"hub_token": null,
|
| 145 |
+
"hub_private_repo": null,
|
| 146 |
+
"hub_always_push": false,
|
| 147 |
+
"hub_revision": null,
|
| 148 |
+
"gradient_checkpointing": true,
|
| 149 |
+
"gradient_checkpointing_kwargs": null,
|
| 150 |
+
"include_inputs_for_metrics": false,
|
| 151 |
+
"include_for_metrics": [],
|
| 152 |
+
"eval_do_concat_batches": true,
|
| 153 |
+
"fp16_backend": "auto",
|
| 154 |
+
"push_to_hub_model_id": null,
|
| 155 |
+
"push_to_hub_organization": null,
|
| 156 |
+
"push_to_hub_token": null,
|
| 157 |
+
"mp_parameters": "",
|
| 158 |
+
"auto_find_batch_size": false,
|
| 159 |
+
"full_determinism": false,
|
| 160 |
+
"torchdynamo": null,
|
| 161 |
+
"ray_scope": "last",
|
| 162 |
+
"ddp_timeout": 18000000,
|
| 163 |
+
"torch_compile": false,
|
| 164 |
+
"torch_compile_backend": null,
|
| 165 |
+
"torch_compile_mode": null,
|
| 166 |
+
"include_tokens_per_second": false,
|
| 167 |
+
"include_num_input_tokens_seen": false,
|
| 168 |
+
"neftune_noise_alpha": null,
|
| 169 |
+
"optim_target_modules": null,
|
| 170 |
+
"batch_eval_metrics": false,
|
| 171 |
+
"eval_on_start": false,
|
| 172 |
+
"use_liger_kernel": false,
|
| 173 |
+
"liger_kernel_config": null,
|
| 174 |
+
"eval_use_gather_object": false,
|
| 175 |
+
"average_tokens_across_devices": true,
|
| 176 |
+
"sortish_sampler": false,
|
| 177 |
+
"predict_with_generate": false,
|
| 178 |
+
"generation_max_length": null,
|
| 179 |
+
"generation_num_beams": null,
|
| 180 |
+
"generation_config": null,
|
| 181 |
+
"tuner_backend": "peft",
|
| 182 |
+
"vit_gradient_checkpointing": null,
|
| 183 |
+
"router_aux_loss_coef": 0.0,
|
| 184 |
+
"enable_dft_loss": false,
|
| 185 |
+
"check_model": true,
|
| 186 |
+
"acc_strategy": "token",
|
| 187 |
+
"train_dataloader_shuffle": true,
|
| 188 |
+
"max_epochs": null,
|
| 189 |
+
"aligner_lr": null,
|
| 190 |
+
"vit_lr": null,
|
| 191 |
+
"use_logits_to_keep": null,
|
| 192 |
+
"channels": null,
|
| 193 |
+
"ds3_gather_for_generation": true,
|
| 194 |
+
"resume_only_model": false,
|
| 195 |
+
"optimizer": null,
|
| 196 |
+
"loss_type": null,
|
| 197 |
+
"metric": null,
|
| 198 |
+
"eval_use_evalscope": false,
|
| 199 |
+
"eval_dataset": [],
|
| 200 |
+
"eval_dataset_args": null,
|
| 201 |
+
"eval_limit": null,
|
| 202 |
+
"eval_generation_config": null,
|
| 203 |
+
"extra_eval_args": null,
|
| 204 |
+
"use_flash_ckpt": false,
|
| 205 |
+
"model": "Qwen/Qwen2.5-7B-Instruct",
|
| 206 |
+
"model_type": "qwen2_5",
|
| 207 |
+
"model_revision": null,
|
| 208 |
+
"task_type": "causal_lm",
|
| 209 |
+
"torch_dtype": "bfloat16",
|
| 210 |
+
"attn_impl": null,
|
| 211 |
+
"new_special_tokens": [],
|
| 212 |
+
"num_labels": null,
|
| 213 |
+
"problem_type": null,
|
| 214 |
+
"rope_scaling": null,
|
| 215 |
+
"device_map": null,
|
| 216 |
+
"max_memory": {},
|
| 217 |
+
"max_model_len": null,
|
| 218 |
+
"local_repo_path": null,
|
| 219 |
+
"init_strategy": null,
|
| 220 |
+
"template": "qwen2_5",
|
| 221 |
+
"system": null,
|
| 222 |
+
"max_length": 16240,
|
| 223 |
+
"truncation_strategy": "delete",
|
| 224 |
+
"max_pixels": null,
|
| 225 |
+
"agent_template": null,
|
| 226 |
+
"norm_bbox": null,
|
| 227 |
+
"use_chat_template": true,
|
| 228 |
+
"padding_free": false,
|
| 229 |
+
"padding_side": "right",
|
| 230 |
+
"loss_scale": "default",
|
| 231 |
+
"sequence_parallel_size": 1,
|
| 232 |
+
"response_prefix": null,
|
| 233 |
+
"template_backend": "swift",
|
| 234 |
+
"dataset": [
|
| 235 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_2083q_0.8_swift.jsonl",
|
| 236 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_hotpot_new1369q_format_0.8_swift.jsonl",
|
| 237 |
+
"/group/40143/hongzhuyi/ms-swift/data/corr_nq_2225q_0.8_swift.jsonl",
|
| 238 |
+
"/group/40143/hongzhuyi/ms-swift/data/self_2000_2000_1369_4_hp673_swift.jsonl",
|
| 239 |
+
"/group/40143/hongzhuyi/ms-swift/self_2000_2000_1369_4_nq400_noinfo_swift.jsonl"
|
| 240 |
+
],
|
| 241 |
+
"val_dataset": [],
|
| 242 |
+
"split_dataset_ratio": 0.001,
|
| 243 |
+
"dataset_num_proc": 100,
|
| 244 |
+
"load_from_cache_file": true,
|
| 245 |
+
"dataset_shuffle": true,
|
| 246 |
+
"val_dataset_shuffle": false,
|
| 247 |
+
"streaming": false,
|
| 248 |
+
"interleave_prob": null,
|
| 249 |
+
"stopping_strategy": "first_exhausted",
|
| 250 |
+
"shuffle_buffer_size": 1000,
|
| 251 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 252 |
+
"columns": {},
|
| 253 |
+
"strict": false,
|
| 254 |
+
"model_name": null,
|
| 255 |
+
"model_author": null,
|
| 256 |
+
"custom_dataset_info": [],
|
| 257 |
+
"quant_method": null,
|
| 258 |
+
"quant_bits": null,
|
| 259 |
+
"hqq_axis": null,
|
| 260 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 261 |
+
"bnb_4bit_quant_type": "nf4",
|
| 262 |
+
"bnb_4bit_use_double_quant": true,
|
| 263 |
+
"bnb_4bit_quant_storage": null,
|
| 264 |
+
"max_new_tokens": 64,
|
| 265 |
+
"temperature": 0.0,
|
| 266 |
+
"top_k": null,
|
| 267 |
+
"top_p": null,
|
| 268 |
+
"repetition_penalty": null,
|
| 269 |
+
"num_beams": 1,
|
| 270 |
+
"stream": false,
|
| 271 |
+
"stop_words": [],
|
| 272 |
+
"logprobs": false,
|
| 273 |
+
"top_logprobs": null,
|
| 274 |
+
"ckpt_dir": null,
|
| 275 |
+
"lora_modules": [],
|
| 276 |
+
"train_type": "full",
|
| 277 |
+
"adapters": [],
|
| 278 |
+
"external_plugins": [],
|
| 279 |
+
"model_kwargs": {},
|
| 280 |
+
"load_args": false,
|
| 281 |
+
"load_data_args": false,
|
| 282 |
+
"packing": false,
|
| 283 |
+
"packing_length": null,
|
| 284 |
+
"lazy_tokenize": false,
|
| 285 |
+
"cached_dataset": [],
|
| 286 |
+
"custom_register_path": [],
|
| 287 |
+
"use_hf": false,
|
| 288 |
+
"ignore_args_error": false,
|
| 289 |
+
"use_swift_lora": false,
|
| 290 |
+
"freeze_parameters": [],
|
| 291 |
+
"freeze_parameters_regex": null,
|
| 292 |
+
"freeze_parameters_ratio": 0.0,
|
| 293 |
+
"trainable_parameters": [],
|
| 294 |
+
"trainable_parameters_regex": null,
|
| 295 |
+
"freeze_llm": false,
|
| 296 |
+
"freeze_vit": true,
|
| 297 |
+
"freeze_aligner": false,
|
| 298 |
+
"target_modules": [
|
| 299 |
+
"all-linear"
|
| 300 |
+
],
|
| 301 |
+
"target_regex": null,
|
| 302 |
+
"modules_to_save": [],
|
| 303 |
+
"lora_rank": 8,
|
| 304 |
+
"lora_alpha": 32,
|
| 305 |
+
"lora_dropout": 0.05,
|
| 306 |
+
"lora_bias": "none",
|
| 307 |
+
"lora_dtype": null,
|
| 308 |
+
"lorap_lr_ratio": null,
|
| 309 |
+
"use_rslora": false,
|
| 310 |
+
"use_dora": false,
|
| 311 |
+
"lora_ga_batch_size": 2,
|
| 312 |
+
"lora_ga_iters": 2,
|
| 313 |
+
"lora_ga_max_length": 1024,
|
| 314 |
+
"lora_ga_direction": "ArB2r",
|
| 315 |
+
"lora_ga_scale": "stable",
|
| 316 |
+
"lora_ga_stable_gamma": 16,
|
| 317 |
+
"init_weights": true,
|
| 318 |
+
"fourier_n_frequency": 2000,
|
| 319 |
+
"fourier_scaling": 300.0,
|
| 320 |
+
"boft_block_size": 4,
|
| 321 |
+
"boft_block_num": 0,
|
| 322 |
+
"boft_n_butterfly_factor": 1,
|
| 323 |
+
"boft_dropout": 0.0,
|
| 324 |
+
"vera_rank": 256,
|
| 325 |
+
"vera_projection_prng_key": 0,
|
| 326 |
+
"vera_dropout": 0.0,
|
| 327 |
+
"vera_d_initial": 0.1,
|
| 328 |
+
"adapter_act": "gelu",
|
| 329 |
+
"adapter_length": 128,
|
| 330 |
+
"use_galore": false,
|
| 331 |
+
"galore_target_modules": null,
|
| 332 |
+
"galore_rank": 128,
|
| 333 |
+
"galore_update_proj_gap": 50,
|
| 334 |
+
"galore_scale": 1.0,
|
| 335 |
+
"galore_proj_type": "std",
|
| 336 |
+
"galore_optim_per_parameter": false,
|
| 337 |
+
"galore_with_embedding": false,
|
| 338 |
+
"galore_quantization": false,
|
| 339 |
+
"galore_proj_quant": false,
|
| 340 |
+
"galore_proj_bits": 4,
|
| 341 |
+
"galore_proj_group_size": 256,
|
| 342 |
+
"galore_cos_threshold": 0.4,
|
| 343 |
+
"galore_gamma_proj": 2,
|
| 344 |
+
"galore_queue_size": 5,
|
| 345 |
+
"adalora_target_r": 8,
|
| 346 |
+
"adalora_init_r": 12,
|
| 347 |
+
"adalora_tinit": 0,
|
| 348 |
+
"adalora_tfinal": 0,
|
| 349 |
+
"adalora_deltaT": 1,
|
| 350 |
+
"adalora_beta1": 0.85,
|
| 351 |
+
"adalora_beta2": 0.85,
|
| 352 |
+
"adalora_orth_reg_weight": 0.5,
|
| 353 |
+
"llamapro_num_new_blocks": 4,
|
| 354 |
+
"llamapro_num_groups": null,
|
| 355 |
+
"lisa_activated_layers": 0,
|
| 356 |
+
"lisa_step_interval": 20,
|
| 357 |
+
"reft_layer_key": null,
|
| 358 |
+
"reft_layers": null,
|
| 359 |
+
"reft_rank": 4,
|
| 360 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 361 |
+
"reft_args": null,
|
| 362 |
+
"swanlab_token": null,
|
| 363 |
+
"swanlab_project": null,
|
| 364 |
+
"swanlab_workspace": null,
|
| 365 |
+
"swanlab_exp_name": null,
|
| 366 |
+
"swanlab_lark_webhook_url": null,
|
| 367 |
+
"swanlab_lark_secret": null,
|
| 368 |
+
"swanlab_mode": "cloud",
|
| 369 |
+
"add_version": true,
|
| 370 |
+
"create_checkpoint_symlink": false,
|
| 371 |
+
"zero_hpz_partition_size": null,
|
| 372 |
+
"deepspeed_autotp_size": null,
|
| 373 |
+
"early_stop_interval": null,
|
| 374 |
+
"rank": 0,
|
| 375 |
+
"global_world_size": 8,
|
| 376 |
+
"local_world_size": 8,
|
| 377 |
+
"model_suffix": "Qwen2.5-7B-Instruct",
|
| 378 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
|
| 379 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding']), ModelGroup(models=[Model(ms_model_id='moonshotai/Kimi-Dev-72B', hf_model_id='moonshotai/Kimi-Dev-72B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f18b0b01ab0>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37'], tags=[])",
|
| 380 |
+
"model_dir": "/root/.cache/modelscope/hub/models/Qwen/Qwen2___5-7B-Instruct",
|
| 381 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 382 |
+
"evaluation_strategy": "epoch",
|
| 383 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=2000.0, dataloader_num_workers=48, dataloader_prefetch_factor=10, past_index=-1, run_name='/group/40143/hongzhuyi/ms-swift/output/v6-20250917-134949', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, channels=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
|
| 384 |
+
}
|
v6-20250917-134949/logging.jsonl
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"loss": 1.27844775, "grad_norm": 25.99517388, "learning_rate": 1.3e-07, "token_acc": 0.73113072, "epoch": 0.00267023, "global_step/max_steps": "1/750", "percentage": "0.13%", "elapsed_time": "22s", "remaining_time": "4h 35m 7s", "memory(GiB)": 25.14, "train_speed(iter/s)": 0.045372}
|
| 2 |
+
{"loss": 1.27462864, "grad_norm": 26.01345823, "learning_rate": 2.6e-07, "token_acc": 0.72517288, "epoch": 0.00534045, "global_step/max_steps": "2/750", "percentage": "0.27%", "elapsed_time": "49s", "remaining_time": "5h 6m 9s", "memory(GiB)": 26.61, "train_speed(iter/s)": 0.040719}
|
| 3 |
+
{"loss": 1.39245546, "grad_norm": 27.78962546, "learning_rate": 3.9e-07, "token_acc": 0.69932902, "epoch": 0.00801068, "global_step/max_steps": "3/750", "percentage": "0.40%", "elapsed_time": "1m 14s", "remaining_time": "5h 7m 12s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.040526}
|
| 4 |
+
{"loss": 1.26101208, "grad_norm": 25.88373791, "learning_rate": 5.3e-07, "token_acc": 0.72536421, "epoch": 0.01068091, "global_step/max_steps": "4/750", "percentage": "0.53%", "elapsed_time": "1m 35s", "remaining_time": "4h 57m 46s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.041755}
|
| 5 |
+
{"loss": 1.30307007, "grad_norm": 27.26872908, "learning_rate": 6.6e-07, "token_acc": 0.72423238, "epoch": 0.01335113, "global_step/max_steps": "5/750", "percentage": "0.67%", "elapsed_time": "1m 55s", "remaining_time": "4h 47m 25s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.043201}
|
| 6 |
+
{"loss": 1.22360718, "grad_norm": 25.55179448, "learning_rate": 7.9e-07, "token_acc": 0.7291103, "epoch": 0.01602136, "global_step/max_steps": "6/750", "percentage": "0.80%", "elapsed_time": "2m 17s", "remaining_time": "4h 45m 3s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.0435}
|
| 7 |
+
{"loss": 1.24122119, "grad_norm": 23.07582775, "learning_rate": 9.2e-07, "token_acc": 0.72044706, "epoch": 0.01869159, "global_step/max_steps": "7/750", "percentage": "0.93%", "elapsed_time": "2m 39s", "remaining_time": "4h 42m 38s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.043812}
|
| 8 |
+
{"loss": 1.14965189, "grad_norm": 22.20997164, "learning_rate": 1.05e-06, "token_acc": 0.72885573, "epoch": 0.02136182, "global_step/max_steps": "8/750", "percentage": "1.07%", "elapsed_time": "3m 0s", "remaining_time": "4h 39m 45s", "memory(GiB)": 31.62, "train_speed(iter/s)": 0.044206}
|
| 9 |
+
{"loss": 1.10636055, "grad_norm": 20.4945993, "learning_rate": 1.18e-06, "token_acc": 0.74439591, "epoch": 0.02403204, "global_step/max_steps": "9/750", "percentage": "1.20%", "elapsed_time": "3m 24s", "remaining_time": "4h 40m 40s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.044002}
|
| 10 |
+
{"loss": 1.02117717, "grad_norm": 13.49092053, "learning_rate": 1.32e-06, "token_acc": 0.7293604, "epoch": 0.02670227, "global_step/max_steps": "10/750", "percentage": "1.33%", "elapsed_time": "3m 51s", "remaining_time": "4h 45m 47s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.043155}
|
| 11 |
+
{"loss": 0.91030097, "grad_norm": 11.63646161, "learning_rate": 1.45e-06, "token_acc": 0.75036925, "epoch": 0.0293725, "global_step/max_steps": "11/750", "percentage": "1.47%", "elapsed_time": "4m 14s", "remaining_time": "4h 45m 7s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.043197}
|
| 12 |
+
{"loss": 0.94025064, "grad_norm": 10.67731611, "learning_rate": 1.58e-06, "token_acc": 0.73384029, "epoch": 0.03204272, "global_step/max_steps": "12/750", "percentage": "1.60%", "elapsed_time": "4m 53s", "remaining_time": "5h 1m 1s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040859}
|
| 13 |
+
{"loss": 0.88358194, "grad_norm": 8.83214012, "learning_rate": 1.71e-06, "token_acc": 0.74344194, "epoch": 0.03471295, "global_step/max_steps": "13/750", "percentage": "1.73%", "elapsed_time": "5m 18s", "remaining_time": "5h 0m 35s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040864}
|
| 14 |
+
{"loss": 0.77978659, "grad_norm": 8.46470553, "learning_rate": 1.84e-06, "token_acc": 0.76530904, "epoch": 0.03738318, "global_step/max_steps": "14/750", "percentage": "1.87%", "elapsed_time": "5m 40s", "remaining_time": "4h 58m 38s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041076}
|
| 15 |
+
{"loss": 0.77927828, "grad_norm": 7.27100215, "learning_rate": 1.97e-06, "token_acc": 0.77147579, "epoch": 0.0400534, "global_step/max_steps": "15/750", "percentage": "2.00%", "elapsed_time": "6m 5s", "remaining_time": "4h 58m 52s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.040986}
|
| 16 |
+
{"loss": 0.84693223, "grad_norm": 7.18134719, "learning_rate": 2.11e-06, "token_acc": 0.75283021, "epoch": 0.04272363, "global_step/max_steps": "16/750", "percentage": "2.13%", "elapsed_time": "6m 26s", "remaining_time": "4h 55m 47s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041358}
|
| 17 |
+
{"loss": 0.78003627, "grad_norm": 5.70513852, "learning_rate": 2.24e-06, "token_acc": 0.77384925, "epoch": 0.04539386, "global_step/max_steps": "17/750", "percentage": "2.27%", "elapsed_time": "6m 45s", "remaining_time": "4h 51m 7s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041965}
|
| 18 |
+
{"loss": 0.78369838, "grad_norm": 5.66400262, "learning_rate": 2.37e-06, "token_acc": 0.77144468, "epoch": 0.04806409, "global_step/max_steps": "18/750", "percentage": "2.40%", "elapsed_time": "7m 8s", "remaining_time": "4h 50m 40s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.041972}
|
| 19 |
+
{"loss": 0.71311992, "grad_norm": 5.2084166, "learning_rate": 2.5e-06, "token_acc": 0.78415757, "epoch": 0.05073431, "global_step/max_steps": "19/750", "percentage": "2.53%", "elapsed_time": "7m 30s", "remaining_time": "4h 48m 36s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042214}
|
| 20 |
+
{"loss": 0.67186064, "grad_norm": 4.74874935, "learning_rate": 2.63e-06, "token_acc": 0.80047834, "epoch": 0.05340454, "global_step/max_steps": "20/750", "percentage": "2.67%", "elapsed_time": "7m 55s", "remaining_time": "4h 49m 11s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042071}
|
| 21 |
+
{"loss": 0.69137228, "grad_norm": 4.78263632, "learning_rate": 2.76e-06, "token_acc": 0.79207921, "epoch": 0.05607477, "global_step/max_steps": "21/750", "percentage": "2.80%", "elapsed_time": "8m 17s", "remaining_time": "4h 47m 46s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.04222}
|
| 22 |
+
{"loss": 0.64969599, "grad_norm": 4.49963775, "learning_rate": 2.89e-06, "token_acc": 0.80375814, "epoch": 0.05874499, "global_step/max_steps": "22/750", "percentage": "2.93%", "elapsed_time": "8m 39s", "remaining_time": "4h 46m 23s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042366}
|
| 23 |
+
{"loss": 0.697088, "grad_norm": 4.21276913, "learning_rate": 3.03e-06, "token_acc": 0.78882074, "epoch": 0.06141522, "global_step/max_steps": "23/750", "percentage": "3.07%", "elapsed_time": "9m 3s", "remaining_time": "4h 46m 12s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042335}
|
| 24 |
+
{"loss": 0.66172689, "grad_norm": 4.00953554, "learning_rate": 3.16e-06, "token_acc": 0.7970311, "epoch": 0.06408545, "global_step/max_steps": "24/750", "percentage": "3.20%", "elapsed_time": "9m 26s", "remaining_time": "4h 45m 41s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042352}
|
| 25 |
+
{"loss": 0.665833, "grad_norm": 4.67415643, "learning_rate": 3.29e-06, "token_acc": 0.79373443, "epoch": 0.06675567, "global_step/max_steps": "25/750", "percentage": "3.33%", "elapsed_time": "9m 50s", "remaining_time": "4h 45m 14s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042362}
|
| 26 |
+
{"loss": 0.60853577, "grad_norm": 4.33591965, "learning_rate": 3.42e-06, "token_acc": 0.80888575, "epoch": 0.0694259, "global_step/max_steps": "26/750", "percentage": "3.47%", "elapsed_time": "10m 11s", "remaining_time": "4h 43m 48s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042516}
|
| 27 |
+
{"loss": 0.68158579, "grad_norm": 4.57626334, "learning_rate": 3.55e-06, "token_acc": 0.79410064, "epoch": 0.07209613, "global_step/max_steps": "27/750", "percentage": "3.60%", "elapsed_time": "10m 31s", "remaining_time": "4h 41m 51s", "memory(GiB)": 38.25, "train_speed(iter/s)": 0.042753}
|
| 28 |
+
{"loss": 0.57617843, "grad_norm": 3.84859446, "learning_rate": 3.68e-06, "token_acc": 0.8145172, "epoch": 0.07476636, "global_step/max_steps": "28/750", "percentage": "3.73%", "elapsed_time": "10m 54s", "remaining_time": "4h 41m 13s", "memory(GiB)": 49.15, "train_speed(iter/s)": 0.042789}
|
| 29 |
+
{"loss": 0.58978891, "grad_norm": 3.78972326, "learning_rate": 3.82e-06, "token_acc": 0.81657141, "epoch": 0.07743658, "global_step/max_steps": "29/750", "percentage": "3.87%", "elapsed_time": "11m 16s", "remaining_time": "4h 40m 14s", "memory(GiB)": 49.15, "train_speed(iter/s)": 0.042879}
|
| 30 |
+
{"loss": 0.65006042, "grad_norm": 3.80896471, "learning_rate": 3.95e-06, "token_acc": 0.79703522, "epoch": 0.08010681, "global_step/max_steps": "30/750", "percentage": "4.00%", "elapsed_time": "11m 41s", "remaining_time": "4h 40m 38s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04276}
|
| 31 |
+
{"loss": 0.56551218, "grad_norm": 3.48714797, "learning_rate": 4.08e-06, "token_acc": 0.81890911, "epoch": 0.08277704, "global_step/max_steps": "31/750", "percentage": "4.13%", "elapsed_time": "12m 0s", "remaining_time": "4h 38m 39s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043004}
|
| 32 |
+
{"loss": 0.56460661, "grad_norm": 3.46457404, "learning_rate": 4.21e-06, "token_acc": 0.82258999, "epoch": 0.08544726, "global_step/max_steps": "32/750", "percentage": "4.27%", "elapsed_time": "12m 20s", "remaining_time": "4h 36m 47s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043233}
|
| 33 |
+
{"loss": 0.60303181, "grad_norm": 3.22141669, "learning_rate": 4.34e-06, "token_acc": 0.80737174, "epoch": 0.08811749, "global_step/max_steps": "33/750", "percentage": "4.40%", "elapsed_time": "12m 42s", "remaining_time": "4h 36m 2s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04329}
|
| 34 |
+
{"loss": 0.57607186, "grad_norm": 3.21232272, "learning_rate": 4.47e-06, "token_acc": 0.81398487, "epoch": 0.09078772, "global_step/max_steps": "34/750", "percentage": "4.53%", "elapsed_time": "13m 7s", "remaining_time": "4h 36m 19s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043186}
|
| 35 |
+
{"loss": 0.60476643, "grad_norm": 3.3457971, "learning_rate": 4.61e-06, "token_acc": 0.80856991, "epoch": 0.09345794, "global_step/max_steps": "35/750", "percentage": "4.67%", "elapsed_time": "13m 28s", "remaining_time": "4h 35m 22s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043273}
|
| 36 |
+
{"loss": 0.58224332, "grad_norm": 3.21682624, "learning_rate": 4.74e-06, "token_acc": 0.81014532, "epoch": 0.09612817, "global_step/max_steps": "36/750", "percentage": "4.80%", "elapsed_time": "13m 52s", "remaining_time": "4h 35m 12s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043241}
|
| 37 |
+
{"loss": 0.55732399, "grad_norm": 3.22129243, "learning_rate": 4.87e-06, "token_acc": 0.82355452, "epoch": 0.0987984, "global_step/max_steps": "37/750", "percentage": "4.93%", "elapsed_time": "14m 12s", "remaining_time": "4h 33m 44s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043412}
|
| 38 |
+
{"loss": 0.51289082, "grad_norm": 3.11871556, "learning_rate": 5e-06, "token_acc": 0.82989198, "epoch": 0.10146862, "global_step/max_steps": "38/750", "percentage": "5.07%", "elapsed_time": "14m 37s", "remaining_time": "4h 33m 54s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043324}
|
| 39 |
+
{"loss": 0.55835736, "grad_norm": 3.41305042, "learning_rate": 5e-06, "token_acc": 0.81758821, "epoch": 0.10413885, "global_step/max_steps": "39/750", "percentage": "5.20%", "elapsed_time": "14m 57s", "remaining_time": "4h 32m 38s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043465}
|
| 40 |
+
{"loss": 0.55718553, "grad_norm": 3.10799851, "learning_rate": 5e-06, "token_acc": 0.82248998, "epoch": 0.10680908, "global_step/max_steps": "40/750", "percentage": "5.33%", "elapsed_time": "15m 18s", "remaining_time": "4h 31m 45s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043545}
|
| 41 |
+
{"loss": 0.5457294, "grad_norm": 2.96653442, "learning_rate": 5e-06, "token_acc": 0.83149326, "epoch": 0.10947931, "global_step/max_steps": "41/750", "percentage": "5.47%", "elapsed_time": "15m 41s", "remaining_time": "4h 31m 27s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04353}
|
| 42 |
+
{"loss": 0.60883057, "grad_norm": 3.02415036, "learning_rate": 5e-06, "token_acc": 0.80839103, "epoch": 0.11214953, "global_step/max_steps": "42/750", "percentage": "5.60%", "elapsed_time": "16m 2s", "remaining_time": "4h 30m 26s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043633}
|
| 43 |
+
{"loss": 0.5560565, "grad_norm": 2.80719127, "learning_rate": 5e-06, "token_acc": 0.8238644, "epoch": 0.11481976, "global_step/max_steps": "43/750", "percentage": "5.73%", "elapsed_time": "16m 22s", "remaining_time": "4h 29m 18s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043755}
|
| 44 |
+
{"loss": 0.50780284, "grad_norm": 2.90999056, "learning_rate": 5e-06, "token_acc": 0.83437282, "epoch": 0.11748999, "global_step/max_steps": "44/750", "percentage": "5.87%", "elapsed_time": "16m 44s", "remaining_time": "4h 28m 44s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043785}
|
| 45 |
+
{"loss": 0.49909928, "grad_norm": 2.96779505, "learning_rate": 5e-06, "token_acc": 0.83670205, "epoch": 0.12016021, "global_step/max_steps": "45/750", "percentage": "6.00%", "elapsed_time": "17m 3s", "remaining_time": "4h 27m 21s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.04395}
|
| 46 |
+
{"loss": 0.59060061, "grad_norm": 2.98663031, "learning_rate": 5e-06, "token_acc": 0.81317079, "epoch": 0.12283044, "global_step/max_steps": "46/750", "percentage": "6.13%", "elapsed_time": "17m 28s", "remaining_time": "4h 27m 23s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043879}
|
| 47 |
+
{"loss": 0.50838113, "grad_norm": 2.86363356, "learning_rate": 5e-06, "token_acc": 0.83875698, "epoch": 0.12550067, "global_step/max_steps": "47/750", "percentage": "6.27%", "elapsed_time": "17m 53s", "remaining_time": "4h 27m 33s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043791}
|
| 48 |
+
{"loss": 0.4888871, "grad_norm": 2.85946941, "learning_rate": 5e-06, "token_acc": 0.8355937, "epoch": 0.12817089, "global_step/max_steps": "48/750", "percentage": "6.40%", "elapsed_time": "18m 13s", "remaining_time": "4h 26m 39s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043876}
|
| 49 |
+
{"loss": 0.54882491, "grad_norm": 2.9982353, "learning_rate": 5e-06, "token_acc": 0.82041854, "epoch": 0.13084112, "global_step/max_steps": "49/750", "percentage": "6.53%", "elapsed_time": "18m 43s", "remaining_time": "4h 27m 54s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043609}
|
| 50 |
+
{"loss": 0.50211453, "grad_norm": 2.84865014, "learning_rate": 5e-06, "token_acc": 0.83424717, "epoch": 0.13351135, "global_step/max_steps": "50/750", "percentage": "6.67%", "elapsed_time": "19m 3s", "remaining_time": "4h 26m 50s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043722}
|
| 51 |
+
{"loss": 0.48680127, "grad_norm": 2.95464195, "learning_rate": 5e-06, "token_acc": 0.84170783, "epoch": 0.13618158, "global_step/max_steps": "51/750", "percentage": "6.80%", "elapsed_time": "19m 26s", "remaining_time": "4h 26m 31s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043711}
|
| 52 |
+
{"loss": 0.55014861, "grad_norm": 2.775706, "learning_rate": 5e-06, "token_acc": 0.8193754, "epoch": 0.1388518, "global_step/max_steps": "52/750", "percentage": "6.93%", "elapsed_time": "19m 49s", "remaining_time": "4h 26m 13s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043698}
|
| 53 |
+
{"loss": 0.56171989, "grad_norm": 2.8737905, "learning_rate": 4.99e-06, "token_acc": 0.82004064, "epoch": 0.14152203, "global_step/max_steps": "53/750", "percentage": "7.07%", "elapsed_time": "20m 10s", "remaining_time": "4h 25m 21s", "memory(GiB)": 61.71, "train_speed(iter/s)": 0.043778}
|
| 54 |
+
{"loss": 0.45519453, "grad_norm": 2.8556276, "learning_rate": 4.99e-06, "token_acc": 0.8451218, "epoch": 0.14419226, "global_step/max_steps": "54/750", "percentage": "7.20%", "elapsed_time": "20m 34s", "remaining_time": "4h 25m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043747}
|
| 55 |
+
{"loss": 0.51091546, "grad_norm": 2.99209119, "learning_rate": 4.99e-06, "token_acc": 0.8289665, "epoch": 0.14686248, "global_step/max_steps": "55/750", "percentage": "7.33%", "elapsed_time": "20m 52s", "remaining_time": "4h 23m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043926}
|
| 56 |
+
{"loss": 0.48474944, "grad_norm": 2.78579951, "learning_rate": 4.99e-06, "token_acc": 0.83626682, "epoch": 0.14953271, "global_step/max_steps": "56/750", "percentage": "7.47%", "elapsed_time": "21m 10s", "remaining_time": "4h 22m 29s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.044066}
|
| 57 |
+
{"loss": 0.4932918, "grad_norm": 2.65078186, "learning_rate": 4.99e-06, "token_acc": 0.83660716, "epoch": 0.15220294, "global_step/max_steps": "57/750", "percentage": "7.60%", "elapsed_time": "21m 48s", "remaining_time": "4h 25m 6s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043567}
|
| 58 |
+
{"loss": 0.50794923, "grad_norm": 3.02283834, "learning_rate": 4.99e-06, "token_acc": 0.83110255, "epoch": 0.15487316, "global_step/max_steps": "58/750", "percentage": "7.73%", "elapsed_time": "22m 12s", "remaining_time": "4h 25m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04352}
|
| 59 |
+
{"loss": 0.49793923, "grad_norm": 2.69818223, "learning_rate": 4.99e-06, "token_acc": 0.83278316, "epoch": 0.15754339, "global_step/max_steps": "59/750", "percentage": "7.87%", "elapsed_time": "22m 33s", "remaining_time": "4h 24m 12s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043589}
|
| 60 |
+
{"loss": 0.54689407, "grad_norm": 2.97761434, "learning_rate": 4.99e-06, "token_acc": 0.8215152, "epoch": 0.16021362, "global_step/max_steps": "60/750", "percentage": "8.00%", "elapsed_time": "22m 56s", "remaining_time": "4h 23m 45s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.0436}
|
| 61 |
+
{"loss": 0.47106403, "grad_norm": 3.09528933, "learning_rate": 4.99e-06, "token_acc": 0.8421275, "epoch": 0.16288385, "global_step/max_steps": "61/750", "percentage": "8.13%", "elapsed_time": "23m 18s", "remaining_time": "4h 23m 11s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043632}
|
| 62 |
+
{"loss": 0.51365429, "grad_norm": 2.91650805, "learning_rate": 4.99e-06, "token_acc": 0.83015621, "epoch": 0.16555407, "global_step/max_steps": "62/750", "percentage": "8.27%", "elapsed_time": "23m 43s", "remaining_time": "4h 23m 17s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043551}
|
| 63 |
+
{"loss": 0.50668406, "grad_norm": 2.8659642, "learning_rate": 4.98e-06, "token_acc": 0.83339614, "epoch": 0.1682243, "global_step/max_steps": "63/750", "percentage": "8.40%", "elapsed_time": "24m 19s", "remaining_time": "4h 25m 18s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043157}
|
| 64 |
+
{"loss": 0.48768979, "grad_norm": 2.62879961, "learning_rate": 4.98e-06, "token_acc": 0.84143567, "epoch": 0.17089453, "global_step/max_steps": "64/750", "percentage": "8.53%", "elapsed_time": "24m 49s", "remaining_time": "4h 26m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042982}
|
| 65 |
+
{"loss": 0.47808069, "grad_norm": 2.7667854, "learning_rate": 4.98e-06, "token_acc": 0.84517378, "epoch": 0.17356475, "global_step/max_steps": "65/750", "percentage": "8.67%", "elapsed_time": "25m 7s", "remaining_time": "4h 24m 45s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043122}
|
| 66 |
+
{"loss": 0.51385736, "grad_norm": 2.66850491, "learning_rate": 4.98e-06, "token_acc": 0.8314994, "epoch": 0.17623498, "global_step/max_steps": "66/750", "percentage": "8.80%", "elapsed_time": "25m 28s", "remaining_time": "4h 24m 5s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043167}
|
| 67 |
+
{"loss": 0.46788478, "grad_norm": 2.90791281, "learning_rate": 4.98e-06, "token_acc": 0.84539282, "epoch": 0.17890521, "global_step/max_steps": "67/750", "percentage": "8.93%", "elapsed_time": "25m 51s", "remaining_time": "4h 23m 32s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043193}
|
| 68 |
+
{"loss": 0.49202442, "grad_norm": 2.85655592, "learning_rate": 4.98e-06, "token_acc": 0.84020692, "epoch": 0.18157543, "global_step/max_steps": "68/750", "percentage": "9.07%", "elapsed_time": "26m 15s", "remaining_time": "4h 23m 19s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043165}
|
| 69 |
+
{"loss": 0.49528018, "grad_norm": 2.83720613, "learning_rate": 4.98e-06, "token_acc": 0.83596516, "epoch": 0.18424566, "global_step/max_steps": "69/750", "percentage": "9.20%", "elapsed_time": "26m 36s", "remaining_time": "4h 22m 36s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043219}
|
| 70 |
+
{"loss": 0.49208367, "grad_norm": 2.76896128, "learning_rate": 4.98e-06, "token_acc": 0.8359322, "epoch": 0.18691589, "global_step/max_steps": "70/750", "percentage": "9.33%", "elapsed_time": "26m 58s", "remaining_time": "4h 21m 59s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043258}
|
| 71 |
+
{"loss": 0.5072571, "grad_norm": 2.68663506, "learning_rate": 4.97e-06, "token_acc": 0.83775103, "epoch": 0.18958611, "global_step/max_steps": "71/750", "percentage": "9.47%", "elapsed_time": "27m 37s", "remaining_time": "4h 24m 13s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04283}
|
| 72 |
+
{"loss": 0.4340511, "grad_norm": 2.47670799, "learning_rate": 4.97e-06, "token_acc": 0.85182673, "epoch": 0.19225634, "global_step/max_steps": "72/750", "percentage": "9.60%", "elapsed_time": "27m 59s", "remaining_time": "4h 23m 39s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042857}
|
| 73 |
+
{"loss": 0.42714188, "grad_norm": 2.57520246, "learning_rate": 4.97e-06, "token_acc": 0.85724175, "epoch": 0.19492657, "global_step/max_steps": "73/750", "percentage": "9.73%", "elapsed_time": "28m 19s", "remaining_time": "4h 22m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042951}
|
| 74 |
+
{"loss": 0.50578225, "grad_norm": 2.99044505, "learning_rate": 4.97e-06, "token_acc": 0.83729088, "epoch": 0.1975968, "global_step/max_steps": "74/750", "percentage": "9.87%", "elapsed_time": "28m 44s", "remaining_time": "4h 22m 35s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042906}
|
| 75 |
+
{"loss": 0.51055467, "grad_norm": 3.09458818, "learning_rate": 4.97e-06, "token_acc": 0.82715166, "epoch": 0.20026702, "global_step/max_steps": "75/750", "percentage": "10.00%", "elapsed_time": "29m 7s", "remaining_time": "4h 22m 4s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042927}
|
| 76 |
+
{"loss": 0.45781508, "grad_norm": 2.63747937, "learning_rate": 4.96e-06, "token_acc": 0.85162896, "epoch": 0.20293725, "global_step/max_steps": "76/750", "percentage": "10.13%", "elapsed_time": "29m 34s", "remaining_time": "4h 22m 18s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042826}
|
| 77 |
+
{"loss": 0.4840948, "grad_norm": 2.71359866, "learning_rate": 4.96e-06, "token_acc": 0.83874184, "epoch": 0.20560748, "global_step/max_steps": "77/750", "percentage": "10.27%", "elapsed_time": "29m 59s", "remaining_time": "4h 22m 8s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042789}
|
| 78 |
+
{"loss": 0.45613506, "grad_norm": 2.46367457, "learning_rate": 4.96e-06, "token_acc": 0.84933531, "epoch": 0.2082777, "global_step/max_steps": "78/750", "percentage": "10.40%", "elapsed_time": "30m 23s", "remaining_time": "4h 21m 46s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042784}
|
| 79 |
+
{"loss": 0.47574463, "grad_norm": 2.68293019, "learning_rate": 4.96e-06, "token_acc": 0.84153438, "epoch": 0.21094793, "global_step/max_steps": "79/750", "percentage": "10.53%", "elapsed_time": "30m 48s", "remaining_time": "4h 21m 37s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042745}
|
| 80 |
+
{"loss": 0.52936327, "grad_norm": 2.77941479, "learning_rate": 4.96e-06, "token_acc": 0.82768434, "epoch": 0.21361816, "global_step/max_steps": "80/750", "percentage": "10.67%", "elapsed_time": "31m 11s", "remaining_time": "4h 21m 16s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04274}
|
| 81 |
+
{"loss": 0.4498612, "grad_norm": 2.49701486, "learning_rate": 4.96e-06, "token_acc": 0.84986889, "epoch": 0.21628838, "global_step/max_steps": "81/750", "percentage": "10.80%", "elapsed_time": "31m 33s", "remaining_time": "4h 20m 42s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042768}
|
| 82 |
+
{"loss": 0.45617718, "grad_norm": 2.66215104, "learning_rate": 4.95e-06, "token_acc": 0.84527934, "epoch": 0.21895861, "global_step/max_steps": "82/750", "percentage": "10.93%", "elapsed_time": "31m 55s", "remaining_time": "4h 20m 5s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042807}
|
| 83 |
+
{"loss": 0.49068668, "grad_norm": 2.83164211, "learning_rate": 4.95e-06, "token_acc": 0.83319885, "epoch": 0.22162884, "global_step/max_steps": "83/750", "percentage": "11.07%", "elapsed_time": "32m 16s", "remaining_time": "4h 19m 25s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042851}
|
| 84 |
+
{"loss": 0.47080141, "grad_norm": 2.62070172, "learning_rate": 4.95e-06, "token_acc": 0.84347826, "epoch": 0.22429907, "global_step/max_steps": "84/750", "percentage": "11.20%", "elapsed_time": "32m 38s", "remaining_time": "4h 18m 50s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042884}
|
| 85 |
+
{"loss": 0.53563607, "grad_norm": 2.9430949, "learning_rate": 4.95e-06, "token_acc": 0.82353771, "epoch": 0.22696929, "global_step/max_steps": "85/750", "percentage": "11.33%", "elapsed_time": "33m 2s", "remaining_time": "4h 18m 28s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042879}
|
| 86 |
+
{"loss": 0.47248948, "grad_norm": 2.86898516, "learning_rate": 4.94e-06, "token_acc": 0.84338623, "epoch": 0.22963952, "global_step/max_steps": "86/750", "percentage": "11.47%", "elapsed_time": "33m 26s", "remaining_time": "4h 18m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042867}
|
| 87 |
+
{"loss": 0.46606526, "grad_norm": 2.56997683, "learning_rate": 4.94e-06, "token_acc": 0.84610194, "epoch": 0.23230975, "global_step/max_steps": "87/750", "percentage": "11.60%", "elapsed_time": "33m 49s", "remaining_time": "4h 17m 43s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042875}
|
| 88 |
+
{"loss": 0.50277519, "grad_norm": 2.7458057, "learning_rate": 4.94e-06, "token_acc": 0.83897567, "epoch": 0.23497997, "global_step/max_steps": "88/750", "percentage": "11.73%", "elapsed_time": "34m 10s", "remaining_time": "4h 17m 9s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042906}
|
| 89 |
+
{"loss": 0.4936814, "grad_norm": 2.67654973, "learning_rate": 4.94e-06, "token_acc": 0.83582497, "epoch": 0.2376502, "global_step/max_steps": "89/750", "percentage": "11.87%", "elapsed_time": "34m 33s", "remaining_time": "4h 16m 38s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042926}
|
| 90 |
+
{"loss": 0.49069792, "grad_norm": 2.65997853, "learning_rate": 4.93e-06, "token_acc": 0.83801562, "epoch": 0.24032043, "global_step/max_steps": "90/750", "percentage": "12.00%", "elapsed_time": "34m 55s", "remaining_time": "4h 16m 7s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042948}
|
| 91 |
+
{"loss": 0.5154832, "grad_norm": 2.66841192, "learning_rate": 4.93e-06, "token_acc": 0.8292045, "epoch": 0.24299065, "global_step/max_steps": "91/750", "percentage": "12.13%", "elapsed_time": "35m 18s", "remaining_time": "4h 15m 40s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042958}
|
| 92 |
+
{"loss": 0.4774642, "grad_norm": 2.76984675, "learning_rate": 4.93e-06, "token_acc": 0.84101444, "epoch": 0.24566088, "global_step/max_steps": "92/750", "percentage": "12.27%", "elapsed_time": "35m 43s", "remaining_time": "4h 15m 33s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042914}
|
| 93 |
+
{"loss": 0.51443124, "grad_norm": 2.6295351, "learning_rate": 4.93e-06, "token_acc": 0.83289403, "epoch": 0.24833111, "global_step/max_steps": "93/750", "percentage": "12.40%", "elapsed_time": "36m 8s", "remaining_time": "4h 15m 19s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042886}
|
| 94 |
+
{"loss": 0.48028237, "grad_norm": 2.73234039, "learning_rate": 4.92e-06, "token_acc": 0.84037942, "epoch": 0.25100134, "global_step/max_steps": "94/750", "percentage": "12.53%", "elapsed_time": "36m 27s", "remaining_time": "4h 14m 23s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042978}
|
| 95 |
+
{"loss": 0.46484447, "grad_norm": 2.4100259, "learning_rate": 4.92e-06, "token_acc": 0.84531331, "epoch": 0.25367156, "global_step/max_steps": "95/750", "percentage": "12.67%", "elapsed_time": "36m 47s", "remaining_time": "4h 13m 37s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043041}
|
| 96 |
+
{"loss": 0.42639616, "grad_norm": 2.531642, "learning_rate": 4.92e-06, "token_acc": 0.85547274, "epoch": 0.25634179, "global_step/max_steps": "96/750", "percentage": "12.80%", "elapsed_time": "37m 10s", "remaining_time": "4h 13m 17s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043033}
|
| 97 |
+
{"loss": 0.43437934, "grad_norm": 2.71661515, "learning_rate": 4.92e-06, "token_acc": 0.852817, "epoch": 0.25901202, "global_step/max_steps": "97/750", "percentage": "12.93%", "elapsed_time": "37m 32s", "remaining_time": "4h 12m 40s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043071}
|
| 98 |
+
{"loss": 0.46744508, "grad_norm": 2.69671582, "learning_rate": 4.91e-06, "token_acc": 0.84636527, "epoch": 0.26168224, "global_step/max_steps": "98/750", "percentage": "13.07%", "elapsed_time": "37m 54s", "remaining_time": "4h 12m 11s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.04309}
|
| 99 |
+
{"loss": 0.45774141, "grad_norm": 2.97757487, "learning_rate": 4.91e-06, "token_acc": 0.85031039, "epoch": 0.26435247, "global_step/max_steps": "99/750", "percentage": "13.20%", "elapsed_time": "38m 15s", "remaining_time": "4h 11m 35s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043126}
|
| 100 |
+
{"loss": 0.45865607, "grad_norm": 2.51920976, "learning_rate": 4.91e-06, "token_acc": 0.85102785, "epoch": 0.2670227, "global_step/max_steps": "100/750", "percentage": "13.33%", "elapsed_time": "38m 40s", "remaining_time": "4h 11m 21s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.0431}
|
| 101 |
+
{"loss": 0.47016156, "grad_norm": 2.781076, "learning_rate": 4.9e-06, "token_acc": 0.84529811, "epoch": 0.26969292, "global_step/max_steps": "101/750", "percentage": "13.47%", "elapsed_time": "39m 9s", "remaining_time": "4h 11m 39s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.042981}
|
| 102 |
+
{"loss": 0.47470731, "grad_norm": 2.96908552, "learning_rate": 4.9e-06, "token_acc": 0.83988202, "epoch": 0.27236315, "global_step/max_steps": "102/750", "percentage": "13.60%", "elapsed_time": "39m 30s", "remaining_time": "4h 11m 0s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043026}
|
| 103 |
+
{"loss": 0.45952839, "grad_norm": 2.77982348, "learning_rate": 4.9e-06, "token_acc": 0.84351736, "epoch": 0.27503338, "global_step/max_steps": "103/750", "percentage": "13.73%", "elapsed_time": "39m 50s", "remaining_time": "4h 10m 16s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043086}
|
| 104 |
+
{"loss": 0.46661314, "grad_norm": 2.64092945, "learning_rate": 4.89e-06, "token_acc": 0.84133554, "epoch": 0.2777036, "global_step/max_steps": "104/750", "percentage": "13.87%", "elapsed_time": "40m 12s", "remaining_time": "4h 9m 43s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043114}
|
| 105 |
+
{"loss": 0.47708511, "grad_norm": 2.66905977, "learning_rate": 4.89e-06, "token_acc": 0.84014326, "epoch": 0.28037383, "global_step/max_steps": "105/750", "percentage": "14.00%", "elapsed_time": "40m 34s", "remaining_time": "4h 9m 14s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043131}
|
| 106 |
+
{"loss": 0.45809263, "grad_norm": 2.5572017, "learning_rate": 4.89e-06, "token_acc": 0.84823924, "epoch": 0.28304406, "global_step/max_steps": "106/750", "percentage": "14.13%", "elapsed_time": "40m 59s", "remaining_time": "4h 9m 2s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043099}
|
| 107 |
+
{"loss": 0.4886952, "grad_norm": 2.58389633, "learning_rate": 4.89e-06, "token_acc": 0.83895445, "epoch": 0.28571429, "global_step/max_steps": "107/750", "percentage": "14.27%", "elapsed_time": "41m 26s", "remaining_time": "4h 9m 2s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043031}
|
| 108 |
+
{"loss": 0.45709381, "grad_norm": 2.55273775, "learning_rate": 4.88e-06, "token_acc": 0.85046595, "epoch": 0.28838451, "global_step/max_steps": "108/750", "percentage": "14.40%", "elapsed_time": "41m 45s", "remaining_time": "4h 8m 13s", "memory(GiB)": 78.32, "train_speed(iter/s)": 0.043107}
|
| 109 |
+
{"loss": 0.53926206, "grad_norm": 2.74545125, "learning_rate": 4.88e-06, "token_acc": 0.82542646, "epoch": 0.29105474, "global_step/max_steps": "109/750", "percentage": "14.53%", "elapsed_time": "42m 12s", "remaining_time": "4h 8m 14s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043036}
|
| 110 |
+
{"loss": 0.45649612, "grad_norm": 2.86313064, "learning_rate": 4.87e-06, "token_acc": 0.84482259, "epoch": 0.29372497, "global_step/max_steps": "110/750", "percentage": "14.67%", "elapsed_time": "42m 34s", "remaining_time": "4h 7m 40s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043068}
|
| 111 |
+
{"loss": 0.47792593, "grad_norm": 2.66411764, "learning_rate": 4.87e-06, "token_acc": 0.84172088, "epoch": 0.29639519, "global_step/max_steps": "111/750", "percentage": "14.80%", "elapsed_time": "42m 56s", "remaining_time": "4h 7m 13s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043077}
|
| 112 |
+
{"loss": 0.41383034, "grad_norm": 2.47914691, "learning_rate": 4.87e-06, "token_acc": 0.86086959, "epoch": 0.29906542, "global_step/max_steps": "112/750", "percentage": "14.93%", "elapsed_time": "43m 21s", "remaining_time": "4h 6m 58s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043054}
|
| 113 |
+
{"loss": 0.48211718, "grad_norm": 2.51735911, "learning_rate": 4.86e-06, "token_acc": 0.83951277, "epoch": 0.30173565, "global_step/max_steps": "113/750", "percentage": "15.07%", "elapsed_time": "43m 45s", "remaining_time": "4h 6m 41s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043035}
|
| 114 |
+
{"loss": 0.47428066, "grad_norm": 2.71498641, "learning_rate": 4.86e-06, "token_acc": 0.84289944, "epoch": 0.30440587, "global_step/max_steps": "114/750", "percentage": "15.20%", "elapsed_time": "44m 5s", "remaining_time": "4h 5m 59s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04309}
|
| 115 |
+
{"loss": 0.46621677, "grad_norm": 2.80723392, "learning_rate": 4.86e-06, "token_acc": 0.84289783, "epoch": 0.3070761, "global_step/max_steps": "115/750", "percentage": "15.33%", "elapsed_time": "44m 27s", "remaining_time": "4h 5m 30s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043109}
|
| 116 |
+
{"loss": 0.44936496, "grad_norm": 2.4883186, "learning_rate": 4.85e-06, "token_acc": 0.84700274, "epoch": 0.30974633, "global_step/max_steps": "116/750", "percentage": "15.47%", "elapsed_time": "44m 50s", "remaining_time": "4h 5m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043118}
|
| 117 |
+
{"loss": 0.50375044, "grad_norm": 2.64161239, "learning_rate": 4.85e-06, "token_acc": 0.82930297, "epoch": 0.31241656, "global_step/max_steps": "117/750", "percentage": "15.60%", "elapsed_time": "45m 12s", "remaining_time": "4h 4m 34s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043136}
|
| 118 |
+
{"loss": 0.42930186, "grad_norm": 2.71646414, "learning_rate": 4.85e-06, "token_acc": 0.85322839, "epoch": 0.31508678, "global_step/max_steps": "118/750", "percentage": "15.73%", "elapsed_time": "45m 36s", "remaining_time": "4h 4m 18s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043116}
|
| 119 |
+
{"loss": 0.45925462, "grad_norm": 2.5869505, "learning_rate": 4.84e-06, "token_acc": 0.84819764, "epoch": 0.31775701, "global_step/max_steps": "119/750", "percentage": "15.87%", "elapsed_time": "46m 1s", "remaining_time": "4h 4m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04309}
|
| 120 |
+
{"loss": 0.46710986, "grad_norm": 2.61488251, "learning_rate": 4.84e-06, "token_acc": 0.84349996, "epoch": 0.32042724, "global_step/max_steps": "120/750", "percentage": "16.00%", "elapsed_time": "46m 27s", "remaining_time": "4h 3m 52s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043054}
|
| 121 |
+
{"loss": 0.46162307, "grad_norm": 2.60637056, "learning_rate": 4.83e-06, "token_acc": 0.84564304, "epoch": 0.32309746, "global_step/max_steps": "121/750", "percentage": "16.13%", "elapsed_time": "46m 48s", "remaining_time": "4h 3m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043082}
|
| 122 |
+
{"loss": 0.48382646, "grad_norm": 2.68137678, "learning_rate": 4.83e-06, "token_acc": 0.83610612, "epoch": 0.32576769, "global_step/max_steps": "122/750", "percentage": "16.27%", "elapsed_time": "47m 8s", "remaining_time": "4h 2m 40s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04313}
|
| 123 |
+
{"loss": 0.44379669, "grad_norm": 2.60598634, "learning_rate": 4.83e-06, "token_acc": 0.85078573, "epoch": 0.32843792, "global_step/max_steps": "123/750", "percentage": "16.40%", "elapsed_time": "47m 32s", "remaining_time": "4h 2m 21s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043118}
|
| 124 |
+
{"loss": 0.47259939, "grad_norm": 2.57065172, "learning_rate": 4.82e-06, "token_acc": 0.83720303, "epoch": 0.33110814, "global_step/max_steps": "124/750", "percentage": "16.53%", "elapsed_time": "47m 52s", "remaining_time": "4h 1m 41s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043167}
|
| 125 |
+
{"loss": 0.43784583, "grad_norm": 2.66877627, "learning_rate": 4.82e-06, "token_acc": 0.85328507, "epoch": 0.33377837, "global_step/max_steps": "125/750", "percentage": "16.67%", "elapsed_time": "48m 13s", "remaining_time": "4h 1m 5s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043205}
|
| 126 |
+
{"loss": 0.49782473, "grad_norm": 2.68240806, "learning_rate": 4.81e-06, "token_acc": 0.83495653, "epoch": 0.3364486, "global_step/max_steps": "126/750", "percentage": "16.80%", "elapsed_time": "48m 32s", "remaining_time": "4h 0m 25s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043256}
|
| 127 |
+
{"loss": 0.45435789, "grad_norm": 2.76874097, "learning_rate": 4.81e-06, "token_acc": 0.84716034, "epoch": 0.33911883, "global_step/max_steps": "127/750", "percentage": "16.93%", "elapsed_time": "48m 55s", "remaining_time": "4h 0m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043261}
|
| 128 |
+
{"loss": 0.44461328, "grad_norm": 2.92010891, "learning_rate": 4.81e-06, "token_acc": 0.8496424, "epoch": 0.34178905, "global_step/max_steps": "128/750", "percentage": "17.07%", "elapsed_time": "49m 13s", "remaining_time": "3h 59m 12s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043336}
|
| 129 |
+
{"loss": 0.4642204, "grad_norm": 2.76921943, "learning_rate": 4.8e-06, "token_acc": 0.84203398, "epoch": 0.34445928, "global_step/max_steps": "129/750", "percentage": "17.20%", "elapsed_time": "49m 34s", "remaining_time": "3h 58m 37s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043373}
|
| 130 |
+
{"loss": 0.45947498, "grad_norm": 2.47558522, "learning_rate": 4.8e-06, "token_acc": 0.84888834, "epoch": 0.34712951, "global_step/max_steps": "130/750", "percentage": "17.33%", "elapsed_time": "49m 54s", "remaining_time": "3h 58m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043417}
|
| 131 |
+
{"loss": 0.50250471, "grad_norm": 2.6805787, "learning_rate": 4.79e-06, "token_acc": 0.82905984, "epoch": 0.34979973, "global_step/max_steps": "131/750", "percentage": "17.47%", "elapsed_time": "50m 14s", "remaining_time": "3h 57m 25s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043451}
|
| 132 |
+
{"loss": 0.48010272, "grad_norm": 2.7863942, "learning_rate": 4.79e-06, "token_acc": 0.83882928, "epoch": 0.35246996, "global_step/max_steps": "132/750", "percentage": "17.60%", "elapsed_time": "50m 38s", "remaining_time": "3h 57m 5s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043444}
|
| 133 |
+
{"loss": 0.44011635, "grad_norm": 2.48276467, "learning_rate": 4.78e-06, "token_acc": 0.85332054, "epoch": 0.35514019, "global_step/max_steps": "133/750", "percentage": "17.73%", "elapsed_time": "51m 3s", "remaining_time": "3h 56m 49s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043421}
|
| 134 |
+
{"loss": 0.46361014, "grad_norm": 2.67930095, "learning_rate": 4.78e-06, "token_acc": 0.84670627, "epoch": 0.35781041, "global_step/max_steps": "134/750", "percentage": "17.87%", "elapsed_time": "51m 26s", "remaining_time": "3h 56m 30s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043409}
|
| 135 |
+
{"loss": 0.45936275, "grad_norm": 2.46022723, "learning_rate": 4.77e-06, "token_acc": 0.84774953, "epoch": 0.36048064, "global_step/max_steps": "135/750", "percentage": "18.00%", "elapsed_time": "51m 51s", "remaining_time": "3h 56m 12s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043394}
|
| 136 |
+
{"loss": 0.44871199, "grad_norm": 2.73454051, "learning_rate": 4.77e-06, "token_acc": 0.85209233, "epoch": 0.36315087, "global_step/max_steps": "136/750", "percentage": "18.13%", "elapsed_time": "52m 13s", "remaining_time": "3h 55m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043397}
|
| 137 |
+
{"loss": 0.44954139, "grad_norm": 2.58445544, "learning_rate": 4.77e-06, "token_acc": 0.85186768, "epoch": 0.36582109, "global_step/max_steps": "137/750", "percentage": "18.27%", "elapsed_time": "52m 33s", "remaining_time": "3h 55m 11s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04344}
|
| 138 |
+
{"loss": 0.42780191, "grad_norm": 2.5114116, "learning_rate": 4.76e-06, "token_acc": 0.85548055, "epoch": 0.36849132, "global_step/max_steps": "138/750", "percentage": "18.40%", "elapsed_time": "52m 55s", "remaining_time": "3h 54m 43s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043454}
|
| 139 |
+
{"loss": 0.42587936, "grad_norm": 2.4745295, "learning_rate": 4.76e-06, "token_acc": 0.85465199, "epoch": 0.37116155, "global_step/max_steps": "139/750", "percentage": "18.53%", "elapsed_time": "53m 15s", "remaining_time": "3h 54m 6s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043497}
|
| 140 |
+
{"loss": 0.45688677, "grad_norm": 2.5234827, "learning_rate": 4.75e-06, "token_acc": 0.84741819, "epoch": 0.37383178, "global_step/max_steps": "140/750", "percentage": "18.67%", "elapsed_time": "53m 36s", "remaining_time": "3h 53m 33s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043529}
|
| 141 |
+
{"loss": 0.45167503, "grad_norm": 2.60895767, "learning_rate": 4.75e-06, "token_acc": 0.85042483, "epoch": 0.376502, "global_step/max_steps": "141/750", "percentage": "18.80%", "elapsed_time": "54m 0s", "remaining_time": "3h 53m 15s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043513}
|
| 142 |
+
{"loss": 0.51007497, "grad_norm": 2.75437796, "learning_rate": 4.74e-06, "token_acc": 0.82781076, "epoch": 0.37917223, "global_step/max_steps": "142/750", "percentage": "18.93%", "elapsed_time": "54m 23s", "remaining_time": "3h 52m 51s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043518}
|
| 143 |
+
{"loss": 0.41679853, "grad_norm": 2.31693874, "learning_rate": 4.74e-06, "token_acc": 0.86460167, "epoch": 0.38184246, "global_step/max_steps": "143/750", "percentage": "19.07%", "elapsed_time": "54m 41s", "remaining_time": "3h 52m 9s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
|
| 144 |
+
{"loss": 0.42220882, "grad_norm": 2.46720865, "learning_rate": 4.73e-06, "token_acc": 0.86303991, "epoch": 0.38451268, "global_step/max_steps": "144/750", "percentage": "19.20%", "elapsed_time": "55m 3s", "remaining_time": "3h 51m 43s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043588}
|
| 145 |
+
{"loss": 0.471122, "grad_norm": 2.47993604, "learning_rate": 4.73e-06, "token_acc": 0.84121382, "epoch": 0.38718291, "global_step/max_steps": "145/750", "percentage": "19.33%", "elapsed_time": "55m 23s", "remaining_time": "3h 51m 8s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043624}
|
| 146 |
+
{"loss": 0.45192125, "grad_norm": 2.60728876, "learning_rate": 4.72e-06, "token_acc": 0.8470009, "epoch": 0.38985314, "global_step/max_steps": "146/750", "percentage": "19.47%", "elapsed_time": "55m 48s", "remaining_time": "3h 50m 51s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043604}
|
| 147 |
+
{"loss": 0.47766036, "grad_norm": 2.42901845, "learning_rate": 4.72e-06, "token_acc": 0.83968115, "epoch": 0.39252336, "global_step/max_steps": "147/750", "percentage": "19.60%", "elapsed_time": "56m 12s", "remaining_time": "3h 50m 32s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043593}
|
| 148 |
+
{"loss": 0.44105217, "grad_norm": 2.3236883, "learning_rate": 4.71e-06, "token_acc": 0.85267359, "epoch": 0.39519359, "global_step/max_steps": "148/750", "percentage": "19.73%", "elapsed_time": "56m 32s", "remaining_time": "3h 50m 0s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043621}
|
| 149 |
+
{"loss": 0.46359736, "grad_norm": 2.51582729, "learning_rate": 4.71e-06, "token_acc": 0.84363395, "epoch": 0.39786382, "global_step/max_steps": "149/750", "percentage": "19.87%", "elapsed_time": "57m 2s", "remaining_time": "3h 50m 3s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04354}
|
| 150 |
+
{"loss": 0.45162177, "grad_norm": 2.4586073, "learning_rate": 4.7e-06, "token_acc": 0.85317409, "epoch": 0.40053405, "global_step/max_steps": "150/750", "percentage": "20.00%", "elapsed_time": "57m 21s", "remaining_time": "3h 49m 27s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04358}
|
| 151 |
+
{"loss": 0.46587312, "grad_norm": 2.46946774, "learning_rate": 4.7e-06, "token_acc": 0.8441934, "epoch": 0.40320427, "global_step/max_steps": "151/750", "percentage": "20.13%", "elapsed_time": "57m 46s", "remaining_time": "3h 49m 10s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043562}
|
| 152 |
+
{"loss": 0.44039536, "grad_norm": 2.5210355, "learning_rate": 4.69e-06, "token_acc": 0.85168988, "epoch": 0.4058745, "global_step/max_steps": "152/750", "percentage": "20.27%", "elapsed_time": "58m 8s", "remaining_time": "3h 48m 42s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
|
| 153 |
+
{"loss": 0.38906497, "grad_norm": 2.28278218, "learning_rate": 4.69e-06, "token_acc": 0.86812884, "epoch": 0.40854473, "global_step/max_steps": "153/750", "percentage": "20.40%", "elapsed_time": "58m 32s", "remaining_time": "3h 48m 26s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043556}
|
| 154 |
+
{"loss": 0.48725152, "grad_norm": 2.79372161, "learning_rate": 4.68e-06, "token_acc": 0.83746082, "epoch": 0.41121495, "global_step/max_steps": "154/750", "percentage": "20.53%", "elapsed_time": "58m 54s", "remaining_time": "3h 47m 59s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043568}
|
| 155 |
+
{"loss": 0.50707674, "grad_norm": 2.663143, "learning_rate": 4.67e-06, "token_acc": 0.83335632, "epoch": 0.41388518, "global_step/max_steps": "155/750", "percentage": "20.67%", "elapsed_time": "59m 22s", "remaining_time": "3h 47m 53s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043515}
|
| 156 |
+
{"loss": 0.46514195, "grad_norm": 2.49969478, "learning_rate": 4.67e-06, "token_acc": 0.8445397, "epoch": 0.41655541, "global_step/max_steps": "156/750", "percentage": "20.80%", "elapsed_time": "59m 42s", "remaining_time": "3h 47m 22s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043541}
|
| 157 |
+
{"loss": 0.43904352, "grad_norm": 2.45751856, "learning_rate": 4.66e-06, "token_acc": 0.85044581, "epoch": 0.41922563, "global_step/max_steps": "157/750", "percentage": "20.93%", "elapsed_time": "1h 0m 4s", "remaining_time": "3h 46m 54s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043556}
|
| 158 |
+
{"loss": 0.47703558, "grad_norm": 2.52444432, "learning_rate": 4.66e-06, "token_acc": 0.8391282, "epoch": 0.42189586, "global_step/max_steps": "158/750", "percentage": "21.07%", "elapsed_time": "1h 0m 24s", "remaining_time": "3h 46m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043592}
|
| 159 |
+
{"loss": 0.46014819, "grad_norm": 2.58692895, "learning_rate": 4.65e-06, "token_acc": 0.84484124, "epoch": 0.42456609, "global_step/max_steps": "159/750", "percentage": "21.20%", "elapsed_time": "1h 0m 48s", "remaining_time": "3h 46m 2s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043575}
|
| 160 |
+
{"loss": 0.44626755, "grad_norm": 2.5726889, "learning_rate": 4.65e-06, "token_acc": 0.85418391, "epoch": 0.42723632, "global_step/max_steps": "160/750", "percentage": "21.33%", "elapsed_time": "1h 1m 11s", "remaining_time": "3h 45m 39s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043577}
|
| 161 |
+
{"loss": 0.43762302, "grad_norm": 2.57624818, "learning_rate": 4.64e-06, "token_acc": 0.85414863, "epoch": 0.42990654, "global_step/max_steps": "161/750", "percentage": "21.47%", "elapsed_time": "1h 1m 33s", "remaining_time": "3h 45m 13s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043586}
|
| 162 |
+
{"loss": 0.45935273, "grad_norm": 2.4689486, "learning_rate": 4.64e-06, "token_acc": 0.85112906, "epoch": 0.43257677, "global_step/max_steps": "162/750", "percentage": "21.60%", "elapsed_time": "1h 1m 56s", "remaining_time": "3h 44m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043592}
|
| 163 |
+
{"loss": 0.42275417, "grad_norm": 2.43153137, "learning_rate": 4.63e-06, "token_acc": 0.8603887, "epoch": 0.435247, "global_step/max_steps": "163/750", "percentage": "21.73%", "elapsed_time": "1h 2m 20s", "remaining_time": "3h 44m 29s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04358}
|
| 164 |
+
{"loss": 0.40657133, "grad_norm": 2.68355566, "learning_rate": 4.62e-06, "token_acc": 0.86099553, "epoch": 0.43791722, "global_step/max_steps": "164/750", "percentage": "21.87%", "elapsed_time": "1h 2m 40s", "remaining_time": "3h 43m 57s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.04361}
|
| 165 |
+
{"loss": 0.4761599, "grad_norm": 2.48708559, "learning_rate": 4.62e-06, "token_acc": 0.84103781, "epoch": 0.44058745, "global_step/max_steps": "165/750", "percentage": "22.00%", "elapsed_time": "1h 3m 7s", "remaining_time": "3h 43m 48s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043563}
|
| 166 |
+
{"loss": 0.47588837, "grad_norm": 2.76328807, "learning_rate": 4.61e-06, "token_acc": 0.84062725, "epoch": 0.44325768, "global_step/max_steps": "166/750", "percentage": "22.13%", "elapsed_time": "1h 3m 32s", "remaining_time": "3h 43m 31s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043545}
|
| 167 |
+
{"loss": 0.4535282, "grad_norm": 2.5396113, "learning_rate": 4.61e-06, "token_acc": 0.84282684, "epoch": 0.4459279, "global_step/max_steps": "167/750", "percentage": "22.27%", "elapsed_time": "1h 3m 55s", "remaining_time": "3h 43m 9s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043541}
|
| 168 |
+
{"loss": 0.41487816, "grad_norm": 2.47265134, "learning_rate": 4.6e-06, "token_acc": 0.85979295, "epoch": 0.44859813, "global_step/max_steps": "168/750", "percentage": "22.40%", "elapsed_time": "1h 4m 19s", "remaining_time": "3h 42m 52s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043524}
|
| 169 |
+
{"loss": 0.44197559, "grad_norm": 2.55595802, "learning_rate": 4.59e-06, "token_acc": 0.84797686, "epoch": 0.45126836, "global_step/max_steps": "169/750", "percentage": "22.53%", "elapsed_time": "1h 4m 39s", "remaining_time": "3h 42m 16s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043563}
|
| 170 |
+
{"loss": 0.42982167, "grad_norm": 2.3608869, "learning_rate": 4.59e-06, "token_acc": 0.85456234, "epoch": 0.45393858, "global_step/max_steps": "170/750", "percentage": "22.67%", "elapsed_time": "1h 5m 2s", "remaining_time": "3h 41m 54s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043562}
|
| 171 |
+
{"loss": 0.42837453, "grad_norm": 2.58920862, "learning_rate": 4.58e-06, "token_acc": 0.85819733, "epoch": 0.45660881, "global_step/max_steps": "171/750", "percentage": "22.80%", "elapsed_time": "1h 5m 22s", "remaining_time": "3h 41m 20s", "memory(GiB)": 90.01, "train_speed(iter/s)": 0.043598}
|
v6-20250917-134949/runs/events.out.tfevents.1758088221.TENCENT64.site.222971.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a821e778e9c842081277c225575c484a82719f4a958746cc7317fbb0ee2c29ee
|
| 3 |
+
size 52504
|
v6-20250917-134949/val_dataset.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|