# Debug run: full-parameter SFT of Qwen3-8B-Base on the `test_10_samples`
# dataset, using the DeepSpeed config referenced below (ZeRO-3 with offload,
# judging by the file name). All keys form one flat mapping; the `###` headers
# are only visual grouping.

### model
model_name_or_path: /shared_workspace_mfs/ximing/Qwen3-8B-Base
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: /shared_workspace_mfs/ximing/LLaMA-Factory/examples/deepspeed/ds_z3_offload_config.json

### dataset
dataset: test_10_samples
template: qwen3_nothink
cutoff_len: 8192  # lowered cutoff_len to reduce memory usage
# NOTE(review): confirm yarn RoPE scaling is actually needed at this cutoff_len.
rope_scaling: yarn
overwrite_cache: true
preprocessing_num_workers: 4  # fewer preprocessing workers
dataloader_num_workers: 2

### output
output_dir: /shared_workspace_mfs/ximing/sft_test_10_samples_debug
logging_steps: 1
save_strategy: steps
save_steps: 10
plot_loss: true
overwrite_output_dir: true
report_to: none  # do not use wandb, to reduce overhead

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 4  # fewer gradient accumulation steps
# Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it as
# a float; a bare `5e-5` is read as a string by those loaders.
learning_rate: 5.0e-5
num_train_epochs: 1  # train for a single epoch only, for testing
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
enable_liger_kernel: false  # liger kernel disabled to avoid potential issues
use_unsloth_gc: false  # unsloth gradient checkpointing disabled to avoid potential issues
flash_attn: fa2