Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- ms-swift/examples/train/multi-gpu/device_map/train.sh +25 -0
- ms-swift/examples/train/multimodal/grounding.sh +27 -0
- ms-swift/examples/train/multimodal/lora_llm_full_vit/sft.sh +30 -0
- ms-swift/examples/train/multimodal/rlhf/dpo.sh +33 -0
- ms-swift/examples/train/rlhf/ppo.sh +33 -0
- ms-swift/examples/train/seq_cls/qwen2_5/sft.sh +28 -0
- ms-swift/examples/train/seq_cls/qwen2_vl/infer.sh +5 -0
- ms-swift/examples/train/tuners/adapter/train.sh +16 -0
- ms-swift/examples/train/tuners/boft/train.sh +16 -0
- ms-swift/examples/train/tuners/dora/train.sh +19 -0
- ms-swift/examples/train/tuners/galore/train_galore.sh +18 -0
- ms-swift/examples/train/tuners/llamapro/train.sh +17 -0
- ms-swift/examples/train/tuners/olora/train.sh +19 -0
- ms-swift/examples/train/tuners/pissa/train.sh +19 -0
- ms-swift/examples/train/tuners/qlora/train.sh +19 -0
- ms-swift/examples/train/tuners/reft/train.sh +17 -0
- ms-swift/ms_swift.egg-info/PKG-INFO +545 -0
- ms-swift/ms_swift.egg-info/not-zip-safe +1 -0
- ms-swift/requirements/install_all.sh +12 -0
- ms-swift/requirements/seq_parallel.txt +1 -0
- ms-swift/requirements/swanlab.txt +1 -0
- ms-swift/scripts/benchmark/config/tuner.json +301 -0
- ms-swift/scripts/benchmark/exp.py +50 -0
- ms-swift/scripts/benchmark/generate_report.py +433 -0
- ms-swift/scripts/utils/run_dataset_info.py +106 -0
- ms-swift/scripts/utils/run_template.py +8 -0
- ms-swift/swift/__init__.py +55 -0
- ms-swift/swift/cli/__init__.py +0 -0
- ms-swift/swift/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- ms-swift/swift/cli/_megatron/pt.py +4 -0
- ms-swift/swift/cli/_megatron/sft.py +4 -0
- ms-swift/swift/cli/app.py +4 -0
- ms-swift/swift/cli/eval.py +5 -0
- ms-swift/swift/cli/export.py +5 -0
- ms-swift/swift/cli/main.py +76 -0
- ms-swift/swift/cli/pt.py +5 -0
- ms-swift/swift/cli/rollout.py +5 -0
- ms-swift/swift/hub/__pycache__/__init__.cpython-310.pyc +0 -0
- ms-swift/swift/hub/__pycache__/hub.cpython-310.pyc +0 -0
- ms-swift/swift/llm/__pycache__/__init__.cpython-310.pyc +0 -0
- ms-swift/swift/llm/__pycache__/data_loader.cpython-310.pyc +0 -0
- ms-swift/swift/llm/app/__init__.py +1 -0
- ms-swift/swift/llm/argument/app_args.py +38 -0
- ms-swift/swift/llm/data_loader.py +105 -0
- ms-swift/swift/llm/dataset/dataset/__pycache__/mllm.cpython-310.pyc +0 -0
- ms-swift/swift/llm/dataset/dataset/llm.py +856 -0
- ms-swift/swift/llm/dataset/preprocessor/__pycache__/__init__.cpython-310.pyc +0 -0
- ms-swift/swift/llm/dataset/preprocessor/__pycache__/core.cpython-310.pyc +0 -0
- ms-swift/swift/llm/dataset/register.py +177 -0
- ms-swift/swift/llm/ds_config/zero0.json +31 -0
ms-swift/examples/train/multi-gpu/device_map/train.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 2 * 76GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0,1 \
|
| 3 |
+
MAX_PIXELS=1003520 \
|
| 4 |
+
swift sft \
|
| 5 |
+
--model Qwen/Qwen2.5-VL-72B-Instruct \
|
| 6 |
+
--dataset 'modelscope/coco_2014_caption:validation#20000' \
|
| 7 |
+
--train_type lora \
|
| 8 |
+
--torch_dtype bfloat16 \
|
| 9 |
+
--num_train_epochs 1 \
|
| 10 |
+
--per_device_train_batch_size 1 \
|
| 11 |
+
--per_device_eval_batch_size 1 \
|
| 12 |
+
--learning_rate 1e-4 \
|
| 13 |
+
--lora_rank 8 \
|
| 14 |
+
--lora_alpha 32 \
|
| 15 |
+
--target_modules all-linear \
|
| 16 |
+
--freeze_vit true \
|
| 17 |
+
--gradient_accumulation_steps 16 \
|
| 18 |
+
--eval_steps 100 \
|
| 19 |
+
--save_steps 100 \
|
| 20 |
+
--save_total_limit 2 \
|
| 21 |
+
--logging_steps 5 \
|
| 22 |
+
--max_length 2048 \
|
| 23 |
+
--output_dir output \
|
| 24 |
+
--warmup_ratio 0.05 \
|
| 25 |
+
--dataloader_num_workers 4
|
ms-swift/examples/train/multimodal/grounding.sh
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 20GiB
|
| 2 |
+
# You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
|
| 3 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 4 |
+
MAX_PIXELS=1003520 \
|
| 5 |
+
swift sft \
|
| 6 |
+
--model Qwen/Qwen2-VL-7B-Instruct \
|
| 7 |
+
--dataset 'AI-ModelScope/coco#20000' \
|
| 8 |
+
--train_type lora \
|
| 9 |
+
--torch_dtype bfloat16 \
|
| 10 |
+
--num_train_epochs 1 \
|
| 11 |
+
--per_device_train_batch_size 1 \
|
| 12 |
+
--per_device_eval_batch_size 1 \
|
| 13 |
+
--learning_rate 1e-4 \
|
| 14 |
+
--lora_rank 8 \
|
| 15 |
+
--lora_alpha 32 \
|
| 16 |
+
--target_modules all-linear \
|
| 17 |
+
--freeze_vit true \
|
| 18 |
+
--gradient_accumulation_steps 16 \
|
| 19 |
+
--eval_steps 100 \
|
| 20 |
+
--save_steps 100 \
|
| 21 |
+
--save_total_limit 2 \
|
| 22 |
+
--logging_steps 5 \
|
| 23 |
+
--max_length 2048 \
|
| 24 |
+
--output_dir output \
|
| 25 |
+
--warmup_ratio 0.05 \
|
| 26 |
+
--dataloader_num_workers 4 \
|
| 27 |
+
--dataset_num_proc 4
|
ms-swift/examples/train/multimodal/lora_llm_full_vit/sft.sh
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 4 * 22GiB
|
| 2 |
+
# vit/merger lr 1e-5; llm lora lr 1e-4
|
| 3 |
+
NPROC_PER_NODE=4 \
|
| 4 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 \
|
| 5 |
+
MAX_PIXELS=1003520 \
|
| 6 |
+
swift sft \
|
| 7 |
+
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
| 8 |
+
--dataset 'AI-ModelScope/coco#20000' \
|
| 9 |
+
--train_type custom \
|
| 10 |
+
--optimizer custom \
|
| 11 |
+
--external_plugins 'examples/train/multimodal/lora_llm_full_vit/custom_plugin.py' \
|
| 12 |
+
--torch_dtype bfloat16 \
|
| 13 |
+
--num_train_epochs 1 \
|
| 14 |
+
--per_device_train_batch_size 1 \
|
| 15 |
+
--per_device_eval_batch_size 1 \
|
| 16 |
+
--learning_rate 1e-4 \
|
| 17 |
+
--lora_rank 16 \
|
| 18 |
+
--lora_alpha 32 \
|
| 19 |
+
--gradient_accumulation_steps 4 \
|
| 20 |
+
--eval_steps 100 \
|
| 21 |
+
--save_steps 100 \
|
| 22 |
+
--save_total_limit 2 \
|
| 23 |
+
--logging_steps 5 \
|
| 24 |
+
--max_length 8192 \
|
| 25 |
+
--output_dir output \
|
| 26 |
+
--warmup_ratio 0.05 \
|
| 27 |
+
--dataloader_num_workers 4 \
|
| 28 |
+
--dataset_num_proc 4 \
|
| 29 |
+
--deepspeed zero2 \
|
| 30 |
+
--save_only_model true
|
ms-swift/examples/train/multimodal/rlhf/dpo.sh
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 4*50GiB
|
| 2 |
+
# You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
|
| 3 |
+
# --rlhf_type cpo/orpo/simpo/rm are also supported
|
| 4 |
+
nproc_per_node=2
|
| 5 |
+
|
| 6 |
+
CUDA_VISIBLE_DEVICES=0,1 \
|
| 7 |
+
NPROC_PER_NODE=$nproc_per_node \
|
| 8 |
+
MAX_PIXELS=1003520 \
|
| 9 |
+
swift rlhf \
|
| 10 |
+
--rlhf_type dpo \
|
| 11 |
+
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
| 12 |
+
--dataset 'swift/RLAIF-V-Dataset#20000' \
|
| 13 |
+
--train_type lora \
|
| 14 |
+
--torch_dtype bfloat16 \
|
| 15 |
+
--num_train_epochs 1 \
|
| 16 |
+
--per_device_train_batch_size 1 \
|
| 17 |
+
--per_device_eval_batch_size 1 \
|
| 18 |
+
--learning_rate 1e-4 \
|
| 19 |
+
--lora_rank 8 \
|
| 20 |
+
--lora_alpha 32 \
|
| 21 |
+
--target_modules all-linear \
|
| 22 |
+
--freeze_vit true \
|
| 23 |
+
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
|
| 24 |
+
--eval_steps 100 \
|
| 25 |
+
--save_steps 100 \
|
| 26 |
+
--save_total_limit 2 \
|
| 27 |
+
--deepspeed zero2 \
|
| 28 |
+
--logging_steps 5 \
|
| 29 |
+
--max_length 4096 \
|
| 30 |
+
--output_dir output \
|
| 31 |
+
--warmup_ratio 0.05 \
|
| 32 |
+
--dataloader_num_workers 4 \
|
| 33 |
+
--dataset_num_proc 4
|
ms-swift/examples/train/rlhf/ppo.sh
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Currently, it only supports the case where the model and reward_model use the same template/tokenizer.
|
| 2 |
+
# Currently, multimodal model PPO is not supported.
|
| 3 |
+
nproc_per_node=4
|
| 4 |
+
|
| 5 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 \
|
| 6 |
+
NPROC_PER_NODE=$nproc_per_node \
|
| 7 |
+
swift rlhf \
|
| 8 |
+
--rlhf_type ppo \
|
| 9 |
+
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
|
| 10 |
+
--reward_model 'AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2' \
|
| 11 |
+
--train_type lora \
|
| 12 |
+
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' 'AI-ModelScope/alpaca-gpt4-data-en#20000' \
|
| 13 |
+
--torch_dtype bfloat16 \
|
| 14 |
+
--num_train_epochs 1 \
|
| 15 |
+
--per_device_train_batch_size 1 \
|
| 16 |
+
--per_device_eval_batch_size 1 \
|
| 17 |
+
--learning_rate 1e-5 \
|
| 18 |
+
--lora_rank 8 \
|
| 19 |
+
--lora_alpha 32 \
|
| 20 |
+
--target_modules all-linear \
|
| 21 |
+
--gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
|
| 22 |
+
--eval_steps 100 \
|
| 23 |
+
--save_steps 100 \
|
| 24 |
+
--save_total_limit 2 \
|
| 25 |
+
--logging_steps 5 \
|
| 26 |
+
--max_length 2048 \
|
| 27 |
+
--output_dir output \
|
| 28 |
+
--warmup_ratio 0.05 \
|
| 29 |
+
--dataloader_num_workers 4 \
|
| 30 |
+
--deepspeed zero2 \
|
| 31 |
+
--response_length 512 \
|
| 32 |
+
--temperature 0.7 \
|
| 33 |
+
--dataset_num_proc 4
|
ms-swift/examples/train/seq_cls/qwen2_5/sft.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# If `num_labels` is provided, it will be considered a classification task,
|
| 2 |
+
# and AutoModelForSequenceClassification will be used to load the model.
|
| 3 |
+
# You can also specify `--model Qwen/Qwen2.5-0.5B-Instruct --use_chat_template true`.
|
| 4 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 5 |
+
swift sft \
|
| 6 |
+
--model Qwen/Qwen2.5-0.5B \
|
| 7 |
+
--train_type lora \
|
| 8 |
+
--dataset 'DAMO_NLP/jd:cls#2000' \
|
| 9 |
+
--torch_dtype bfloat16 \
|
| 10 |
+
--num_train_epochs 1 \
|
| 11 |
+
--per_device_train_batch_size 1 \
|
| 12 |
+
--per_device_eval_batch_size 1 \
|
| 13 |
+
--learning_rate 1e-4 \
|
| 14 |
+
--lora_rank 8 \
|
| 15 |
+
--lora_alpha 32 \
|
| 16 |
+
--target_modules all-linear \
|
| 17 |
+
--gradient_accumulation_steps 16 \
|
| 18 |
+
--eval_steps 50 \
|
| 19 |
+
--save_steps 50 \
|
| 20 |
+
--save_total_limit 2 \
|
| 21 |
+
--logging_steps 5 \
|
| 22 |
+
--max_length 2048 \
|
| 23 |
+
--output_dir output \
|
| 24 |
+
--warmup_ratio 0.05 \
|
| 25 |
+
--dataloader_num_workers 4 \
|
| 26 |
+
--num_labels 2 \
|
| 27 |
+
--task_type seq_cls \
|
| 28 |
+
--use_chat_template false
|
ms-swift/examples/train/seq_cls/qwen2_vl/infer.sh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 2 |
+
MAX_PIXELS=1003520 \
|
| 3 |
+
swift infer \
|
| 4 |
+
--adapters output/vx-xxx/checkpoint-xxx \
|
| 5 |
+
--load_data_args true
|
ms-swift/examples/train/tuners/adapter/train.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 17GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type adapter \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-4 \
|
| 10 |
+
--gradient_accumulation_steps 16 \
|
| 11 |
+
--eval_steps 100 \
|
| 12 |
+
--save_steps 100 \
|
| 13 |
+
--save_total_limit 2 \
|
| 14 |
+
--logging_steps 5 \
|
| 15 |
+
--model_author swift \
|
| 16 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/boft/train.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 17GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type boft \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-4 \
|
| 10 |
+
--gradient_accumulation_steps 16 \
|
| 11 |
+
--eval_steps 100 \
|
| 12 |
+
--save_steps 100 \
|
| 13 |
+
--save_total_limit 2 \
|
| 14 |
+
--logging_steps 5 \
|
| 15 |
+
--model_author swift \
|
| 16 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/dora/train.sh
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 17.2GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type lora \
|
| 6 |
+
--use_dora true \
|
| 7 |
+
--dataset 'swift/self-cognition#1000' \
|
| 8 |
+
--num_train_epochs 1 \
|
| 9 |
+
--per_device_train_batch_size 1 \
|
| 10 |
+
--learning_rate 1e-4 \
|
| 11 |
+
--lora_rank 8 \
|
| 12 |
+
--lora_alpha 32 \
|
| 13 |
+
--gradient_accumulation_steps 16 \
|
| 14 |
+
--eval_steps 100 \
|
| 15 |
+
--save_steps 100 \
|
| 16 |
+
--save_total_limit 2 \
|
| 17 |
+
--logging_steps 5 \
|
| 18 |
+
--model_author swift \
|
| 19 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/galore/train_galore.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 38GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type full \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-5 \
|
| 10 |
+
--gradient_accumulation_steps 16 \
|
| 11 |
+
--eval_steps 100 \
|
| 12 |
+
--save_steps 100 \
|
| 13 |
+
--save_total_limit 2 \
|
| 14 |
+
--logging_steps 5 \
|
| 15 |
+
--model_author swift \
|
| 16 |
+
--model_name swift-robot \
|
| 17 |
+
--use_galore true \
|
| 18 |
+
--galore_optim_per_parameter true
|
ms-swift/examples/train/tuners/llamapro/train.sh
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 25.4GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type llamapro \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--llamapro_num_new_blocks 4 \
|
| 8 |
+
--num_train_epochs 1 \
|
| 9 |
+
--per_device_train_batch_size 1 \
|
| 10 |
+
--learning_rate 1e-4 \
|
| 11 |
+
--gradient_accumulation_steps 16 \
|
| 12 |
+
--eval_steps 100 \
|
| 13 |
+
--save_steps 100 \
|
| 14 |
+
--save_total_limit 2 \
|
| 15 |
+
--logging_steps 5 \
|
| 16 |
+
--model_author swift \
|
| 17 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/olora/train.sh
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 17GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type lora \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-4 \
|
| 10 |
+
--lora_rank 8 \
|
| 11 |
+
--lora_alpha 32 \
|
| 12 |
+
--init_lora_weights olora \
|
| 13 |
+
--gradient_accumulation_steps 16 \
|
| 14 |
+
--eval_steps 100 \
|
| 15 |
+
--save_steps 100 \
|
| 16 |
+
--save_total_limit 2 \
|
| 17 |
+
--logging_steps 5 \
|
| 18 |
+
--model_author swift \
|
| 19 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/pissa/train.sh
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 17GiB
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift sft \
|
| 4 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 5 |
+
--train_type lora \
|
| 6 |
+
--dataset 'swift/self-cognition#1000' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-4 \
|
| 10 |
+
--lora_rank 8 \
|
| 11 |
+
--lora_alpha 32 \
|
| 12 |
+
--init_lora_weights pissa \
|
| 13 |
+
--gradient_accumulation_steps 16 \
|
| 14 |
+
--eval_steps 100 \
|
| 15 |
+
--save_steps 100 \
|
| 16 |
+
--save_total_limit 2 \
|
| 17 |
+
--logging_steps 5 \
|
| 18 |
+
--model_author swift \
|
| 19 |
+
--model_name swift-robot
|
ms-swift/examples/train/tuners/qlora/train.sh
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 2 |
+
swift sft \
|
| 3 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 4 |
+
--train_type lora \
|
| 5 |
+
--dataset 'swift/self-cognition#1000' \
|
| 6 |
+
--num_train_epochs 1 \
|
| 7 |
+
--per_device_train_batch_size 1 \
|
| 8 |
+
--learning_rate 1e-4 \
|
| 9 |
+
--lora_rank 8 \
|
| 10 |
+
--lora_alpha 32 \
|
| 11 |
+
--gradient_accumulation_steps 16 \
|
| 12 |
+
--eval_steps 100 \
|
| 13 |
+
--save_steps 100 \
|
| 14 |
+
--save_total_limit 2 \
|
| 15 |
+
--logging_steps 5 \
|
| 16 |
+
--model_author swift \
|
| 17 |
+
--model_name swift-robot \
|
| 18 |
+
--quant_bits 4 \
|
| 19 |
+
--quant_method bnb
|
ms-swift/examples/train/tuners/reft/train.sh
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 2 |
+
swift sft \
|
| 3 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 4 |
+
--train_type reft \
|
| 5 |
+
--dataset 'swift/self-cognition#1000' \
|
| 6 |
+
--reft_intervention_type 'LoreftIntervention' \
|
| 7 |
+
--num_train_epochs 1 \
|
| 8 |
+
--per_device_train_batch_size 1 \
|
| 9 |
+
--learning_rate 1e-4 \
|
| 10 |
+
--gradient_checkpointing false \
|
| 11 |
+
--gradient_accumulation_steps 16 \
|
| 12 |
+
--eval_steps 100 \
|
| 13 |
+
--save_steps 100 \
|
| 14 |
+
--save_total_limit 2 \
|
| 15 |
+
--logging_steps 5 \
|
| 16 |
+
--model_author swift \
|
| 17 |
+
--model_name swift-robot
|
ms-swift/ms_swift.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: ms_swift
|
| 3 |
+
Version: 3.5.0.dev0
|
| 4 |
+
Summary: Swift: Scalable lightWeight Infrastructure for Fine-Tuning
|
| 5 |
+
Home-page: https://github.com/modelscope/swift
|
| 6 |
+
Author: DAMO ModelScope teams
|
| 7 |
+
Author-email: contact@modelscope.cn
|
| 8 |
+
License: Apache License 2.0
|
| 9 |
+
Keywords: python,petl,efficient tuners
|
| 10 |
+
Platform: UNKNOWN
|
| 11 |
+
Classifier: Development Status :: 4 - Beta
|
| 12 |
+
Classifier: License :: OSI Approved :: Apache Software License
|
| 13 |
+
Classifier: Operating System :: OS Independent
|
| 14 |
+
Classifier: Programming Language :: Python :: 3
|
| 15 |
+
Classifier: Programming Language :: Python :: 3.8
|
| 16 |
+
Classifier: Programming Language :: Python :: 3.9
|
| 17 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 18 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 19 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 20 |
+
Description-Content-Type: text/markdown
|
| 21 |
+
License-File: LICENSE
|
| 22 |
+
Requires-Dist: accelerate
|
| 23 |
+
Requires-Dist: addict
|
| 24 |
+
Requires-Dist: aiohttp
|
| 25 |
+
Requires-Dist: attrdict
|
| 26 |
+
Requires-Dist: binpacking
|
| 27 |
+
Requires-Dist: charset_normalizer
|
| 28 |
+
Requires-Dist: cpm_kernels
|
| 29 |
+
Requires-Dist: dacite
|
| 30 |
+
Requires-Dist: datasets<3.4,>=3.0
|
| 31 |
+
Requires-Dist: einops
|
| 32 |
+
Requires-Dist: fastapi
|
| 33 |
+
Requires-Dist: gradio>=3.40.0
|
| 34 |
+
Requires-Dist: importlib_metadata
|
| 35 |
+
Requires-Dist: jieba
|
| 36 |
+
Requires-Dist: matplotlib
|
| 37 |
+
Requires-Dist: modelscope>=1.23
|
| 38 |
+
Requires-Dist: nltk
|
| 39 |
+
Requires-Dist: numpy<2.0
|
| 40 |
+
Requires-Dist: openai
|
| 41 |
+
Requires-Dist: oss2
|
| 42 |
+
Requires-Dist: pandas
|
| 43 |
+
Requires-Dist: peft<0.16,>=0.11
|
| 44 |
+
Requires-Dist: pillow
|
| 45 |
+
Requires-Dist: requests
|
| 46 |
+
Requires-Dist: rouge
|
| 47 |
+
Requires-Dist: safetensors
|
| 48 |
+
Requires-Dist: scipy
|
| 49 |
+
Requires-Dist: sentencepiece
|
| 50 |
+
Requires-Dist: simplejson>=3.3.0
|
| 51 |
+
Requires-Dist: sortedcontainers>=1.5.9
|
| 52 |
+
Requires-Dist: tensorboard
|
| 53 |
+
Requires-Dist: tiktoken
|
| 54 |
+
Requires-Dist: tqdm
|
| 55 |
+
Requires-Dist: transformers<4.53,>=4.33
|
| 56 |
+
Requires-Dist: transformers_stream_generator
|
| 57 |
+
Requires-Dist: trl<0.18,>=0.13
|
| 58 |
+
Requires-Dist: uvicorn
|
| 59 |
+
Requires-Dist: zstandard
|
| 60 |
+
Provides-Extra: eval
|
| 61 |
+
Requires-Dist: evalscope[opencompass]; extra == "eval"
|
| 62 |
+
Requires-Dist: evalscope[vlmeval]; extra == "eval"
|
| 63 |
+
Provides-Extra: swanlab
|
| 64 |
+
Requires-Dist: swanlab; extra == "swanlab"
|
| 65 |
+
Provides-Extra: seq-parallel
|
| 66 |
+
Requires-Dist: xtuner; extra == "seq-parallel"
|
| 67 |
+
Provides-Extra: all
|
| 68 |
+
Requires-Dist: accelerate; extra == "all"
|
| 69 |
+
Requires-Dist: addict; extra == "all"
|
| 70 |
+
Requires-Dist: aiohttp; extra == "all"
|
| 71 |
+
Requires-Dist: attrdict; extra == "all"
|
| 72 |
+
Requires-Dist: binpacking; extra == "all"
|
| 73 |
+
Requires-Dist: charset_normalizer; extra == "all"
|
| 74 |
+
Requires-Dist: cpm_kernels; extra == "all"
|
| 75 |
+
Requires-Dist: dacite; extra == "all"
|
| 76 |
+
Requires-Dist: datasets<3.4,>=3.0; extra == "all"
|
| 77 |
+
Requires-Dist: einops; extra == "all"
|
| 78 |
+
Requires-Dist: fastapi; extra == "all"
|
| 79 |
+
Requires-Dist: gradio>=3.40.0; extra == "all"
|
| 80 |
+
Requires-Dist: importlib_metadata; extra == "all"
|
| 81 |
+
Requires-Dist: jieba; extra == "all"
|
| 82 |
+
Requires-Dist: matplotlib; extra == "all"
|
| 83 |
+
Requires-Dist: modelscope>=1.23; extra == "all"
|
| 84 |
+
Requires-Dist: nltk; extra == "all"
|
| 85 |
+
Requires-Dist: numpy<2.0; extra == "all"
|
| 86 |
+
Requires-Dist: openai; extra == "all"
|
| 87 |
+
Requires-Dist: oss2; extra == "all"
|
| 88 |
+
Requires-Dist: pandas; extra == "all"
|
| 89 |
+
Requires-Dist: peft<0.16,>=0.11; extra == "all"
|
| 90 |
+
Requires-Dist: pillow; extra == "all"
|
| 91 |
+
Requires-Dist: requests; extra == "all"
|
| 92 |
+
Requires-Dist: rouge; extra == "all"
|
| 93 |
+
Requires-Dist: safetensors; extra == "all"
|
| 94 |
+
Requires-Dist: scipy; extra == "all"
|
| 95 |
+
Requires-Dist: sentencepiece; extra == "all"
|
| 96 |
+
Requires-Dist: simplejson>=3.3.0; extra == "all"
|
| 97 |
+
Requires-Dist: sortedcontainers>=1.5.9; extra == "all"
|
| 98 |
+
Requires-Dist: tensorboard; extra == "all"
|
| 99 |
+
Requires-Dist: tiktoken; extra == "all"
|
| 100 |
+
Requires-Dist: tqdm; extra == "all"
|
| 101 |
+
Requires-Dist: transformers<4.53,>=4.33; extra == "all"
|
| 102 |
+
Requires-Dist: transformers_stream_generator; extra == "all"
|
| 103 |
+
Requires-Dist: trl<0.18,>=0.13; extra == "all"
|
| 104 |
+
Requires-Dist: uvicorn; extra == "all"
|
| 105 |
+
Requires-Dist: zstandard; extra == "all"
|
| 106 |
+
Requires-Dist: evalscope[opencompass]; extra == "all"
|
| 107 |
+
Requires-Dist: evalscope[vlmeval]; extra == "all"
|
| 108 |
+
Requires-Dist: xtuner; extra == "all"
|
| 109 |
+
Requires-Dist: swanlab; extra == "all"
|
| 110 |
+
Dynamic: author
|
| 111 |
+
Dynamic: author-email
|
| 112 |
+
Dynamic: classifier
|
| 113 |
+
Dynamic: description
|
| 114 |
+
Dynamic: description-content-type
|
| 115 |
+
Dynamic: home-page
|
| 116 |
+
Dynamic: keywords
|
| 117 |
+
Dynamic: license
|
| 118 |
+
Dynamic: license-file
|
| 119 |
+
Dynamic: provides-extra
|
| 120 |
+
Dynamic: requires-dist
|
| 121 |
+
Dynamic: summary
|
| 122 |
+
|
| 123 |
+
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
|
| 124 |
+
|
| 125 |
+
<p align="center">
|
| 126 |
+
<br>
|
| 127 |
+
<img src="asset/banner.png"/>
|
| 128 |
+
<br>
|
| 129 |
+
<p>
|
| 130 |
+
<p align="center">
|
| 131 |
+
<a href="https://modelscope.cn/home">ModelScope Community Website</a>
|
| 132 |
+
<br>
|
| 133 |
+
<a href="README_CN.md">中文</a>   |   English  
|
| 134 |
+
</p>
|
| 135 |
+
|
| 136 |
+
<p align="center">
|
| 137 |
+
<img src="https://img.shields.io/badge/python-3.10-5be.svg">
|
| 138 |
+
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
|
| 139 |
+
<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.19-5D91D4.svg"></a>
|
| 140 |
+
<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
|
| 141 |
+
<a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
|
| 142 |
+
<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
|
| 143 |
+
<a href="https://github.com/modelscope/swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
|
| 144 |
+
</p>
|
| 145 |
+
|
| 146 |
+
<p align="center">
|
| 147 |
+
<a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 148 |
+
</p>
|
| 149 |
+
|
| 150 |
+
<p align="center">
|
| 151 |
+
<a href="https://arxiv.org/abs/2408.05517">Paper</a>   | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a>   |   <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a>  
|
| 152 |
+
</p>
|
| 153 |
+
|
| 154 |
+
## 📖 Table of Contents
|
| 155 |
+
- [Groups](#-Groups)
|
| 156 |
+
- [Introduction](#-introduction)
|
| 157 |
+
- [News](#-news)
|
| 158 |
+
- [Installation](#%EF%B8%8F-installation)
|
| 159 |
+
- [Quick Start](#-quick-Start)
|
| 160 |
+
- [Usage](#-Usage)
|
| 161 |
+
- [License](#-License)
|
| 162 |
+
- [Citation](#-citation)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
## ☎ Groups
|
| 166 |
+
|
| 167 |
+
You can contact us and communicate with us by adding our group:
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
|
| 171 |
+
:-------------------------:|:-------------------------:
|
| 172 |
+
<img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
## 📝 Introduction
|
| 176 |
+
🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 500+ large models and 200+ multi-modal large models. These large language models (LLMs) include models such as Qwen3, Qwen3-MoE, Qwen2.5, InternLM3, GLM4, Mistral, DeepSeek-R1, Yi1.5, TeleChat2, Baichuan2, and Gemma2. The multi-modal LLMs include models such as Qwen2.5-VL, Qwen2-Audio, Llama3.4, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2.
|
| 177 |
+
|
| 178 |
+
🍔 Additionally, ms-swift incorporates the latest training technologies, including lightweight techniques such as LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger, as well as human alignment training methods like DPO, GRPO, RM, PPO, KTO, CPO, SimPO, and ORPO. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and it supports model quantization with technologies like GPTQ, AWQ, and BNB. Furthermore, ms-swift offers a Gradio-based Web UI and a wealth of best practices.
|
| 179 |
+
|
| 180 |
+
**Why choose ms-swift?**
|
| 181 |
+
|
| 182 |
+
- 🍎 **Model Types**: Supports 500+ pure text large models, **200+ multi-modal large models**, as well as All-to-All multi-modal models, sequence classification models, and embedding models, **covering the entire process from training to deployment**.
|
| 183 |
+
- **Dataset Types**: Comes with 150+ pre-training, fine-tuning, human alignment, multi-modal datasets, and supports custom datasets.
|
| 184 |
+
- **Hardware Support**: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, MPS, etc.
|
| 185 |
+
- 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel.
|
| 186 |
+
- **Distributed Training**: Supports distributed data parallel (DDP), device_map simple model parallelism, DeepSpeed ZeRO2/ZeRO3, FSDP, and other distributed training techniques.
|
| 187 |
+
- **Quantization Training**: Supports training quantized models like BNB, AWQ, GPTQ, AQLM, HQQ, EETQ.
|
| 188 |
+
- **RLHF Training**: Supports human alignment training methods such as DPO, GRPO, RM, PPO, KTO, CPO, SimPO, ORPO for both pure text and multi-modal large models.
|
| 189 |
+
- 🍓 **Multi-Modal Training**: Supports training on different modalities like images, videos, and audio, for tasks like VQA, captioning, OCR, and grounding.
|
| 190 |
+
- **Interface Training**: Provides capabilities for training, inference, evaluation, quantization through an interface, completing the whole large model pipeline.
|
| 191 |
+
- **Plugin and Extension**: Supports custom model and dataset extensions, as well as customization of components like loss, metric, trainer, loss-scale, callback, optimizer.
|
| 192 |
+
- 🍉 **Toolbox Capabilities**: Offers not only training support for large models and multi-modal large models but also covers the entire process of inference, evaluation, quantization, and deployment.
|
| 193 |
+
- **Inference Acceleration**: Supports inference acceleration engines like PyTorch, vLLM, LmDeploy, and provides OpenAI API for accelerating inference, deployment, and evaluation modules.
|
| 194 |
+
- **Model Evaluation**: Uses EvalScope as the evaluation backend and supports evaluation on 100+ datasets for both pure text and multi-modal models.
|
| 195 |
+
- **Model Quantization**: Supports AWQ, GPTQ, and BNB quantized exports, with models that can use vLLM/LmDeploy for inference acceleration and continue training.
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
## 🎉 News
|
| 199 |
+
- 🎁 2025.05.11: GRPO now supports custom processing logic for reward models. See the GenRM example [here](./docs/source_en/Instruction/GRPO.md#customized-reward-models).
|
| 200 |
+
- 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025. You can find the paper at [this link](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
|
| 201 |
+
- 🎁 2025.03.23: Multi-round GRPO is now supported for training multi-turn dialogue scenarios (e.g., agent tool calling). Please refer to the [training script](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/train_multi_round.sh).
|
| 202 |
+
- 🎁 2025.03.16: Support for Megatron's parallel training techniques is now available. Please see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/zh-cn/latest/Instruction/Megatron-SWIFT训练.html).
|
| 203 |
+
- 🎁 2025.03.15: Fine-tuning of embedding models for both pure text and multimodal models is supported. Please check the [training script](https://github.com/modelscope/ms-swift/tree/main/examples/train/embedding).
|
| 204 |
+
- 🎁 2025.03.05: The hybrid mode for GRPO is supported, with a script for training a 72B model on 4 GPUs (4*80G) available [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/train_72b_4gpu.sh). Tensor parallelism with vllm is also supported, with the training script available [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_gpu_mp_colocate.sh).
|
| 205 |
+
- 🎁 2025.02.21: The GRPO algorithm now supports LMDeploy, with the training script available [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/full_lmdeploy.sh). Additionally, the performance of the GRPO algorithm has been tested, achieving a training speed increase of up to 300% using various tricks. Please check the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
|
| 206 |
+
- 🎁 2025.02.21: The `swift sample` command is now supported. The reinforcement fine-tuning script can be found [here](https://github.com/modelscope/ms-swift/blob/main/docs/source/Instruction/强化微调.md), and the large model API distillation sampling script is available [here](https://github.com/modelscope/ms-swift/blob/main/examples/sampler/distill/distill.sh).
|
| 207 |
+
- 🔥 2025.02.12: Support for the GRPO (Group Relative Policy Optimization) training algorithm has been added. Documentation is available [here](https://github.com/modelscope/ms-swift/blob/main/docs/source/Instruction/GRPO.md).
|
| 208 |
+
- 🎁 2024.12.04: Major update to **ms-swift 3.0**. Please refer to the [release notes and changes](https://swift.readthedocs.io/zh-cn/latest/Instruction/ReleaseNote3.0.html).
|
| 209 |
+
<details><summary>More</summary>
|
| 210 |
+
|
| 211 |
+
- 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
|
| 212 |
+
- 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models.
|
| 213 |
+
- 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`.
|
| 214 |
+
- 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
|
| 215 |
+
- 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf).
|
| 216 |
+
</details>
|
| 217 |
+
|
| 218 |
+
## 🛠️ Installation
|
| 219 |
+
To install using pip:
|
| 220 |
+
```shell
|
| 221 |
+
pip install ms-swift -U
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
To install from source:
|
| 225 |
+
```shell
|
| 226 |
+
# pip install git+https://github.com/modelscope/ms-swift.git
|
| 227 |
+
|
| 228 |
+
git clone https://github.com/modelscope/ms-swift.git
|
| 229 |
+
cd ms-swift
|
| 230 |
+
pip install -e .
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
Running Environment:
|
| 234 |
+
|
| 235 |
+
| | Range | Recommended | Notes |
|
| 236 |
+
| ------------ |--------------| ----------- | ----------------------------------------- |
|
| 237 |
+
| python | >=3.9 | 3.10 | |
|
| 238 |
+
| cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
|
| 239 |
+
| torch | >=2.0 | | |
|
| 240 |
+
| transformers | >=4.33 | 4.51 | |
|
| 241 |
+
| modelscope | >=1.23 | | |
|
| 242 |
+
| peft | >=0.11,<0.16 | ||
|
| 243 |
+
| trl | >=0.13,<0.18 | 0.17 |RLHF|
|
| 244 |
+
| deepspeed | >=0.14 | 0.14.5 | Training |
|
| 245 |
+
| vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
|
| 246 |
+
| lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
|
| 247 |
+
| evalscope | >=0.11 | | Evaluation |
|
| 248 |
+
|
| 249 |
+
For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
## 🚀 Quick Start
|
| 253 |
+
|
| 254 |
+
10 minutes of self-cognition fine-tuning of Qwen2.5-7B-Instruct on a single 3090 GPU:
|
| 255 |
+
|
| 256 |
+
### Command Line Interface
|
| 257 |
+
|
| 258 |
+
```shell
|
| 259 |
+
# 22GB
|
| 260 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 261 |
+
swift sft \
|
| 262 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 263 |
+
--train_type lora \
|
| 264 |
+
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
|
| 265 |
+
'AI-ModelScope/alpaca-gpt4-data-en#500' \
|
| 266 |
+
'swift/self-cognition#500' \
|
| 267 |
+
--torch_dtype bfloat16 \
|
| 268 |
+
--num_train_epochs 1 \
|
| 269 |
+
--per_device_train_batch_size 1 \
|
| 270 |
+
--per_device_eval_batch_size 1 \
|
| 271 |
+
--learning_rate 1e-4 \
|
| 272 |
+
--lora_rank 8 \
|
| 273 |
+
--lora_alpha 32 \
|
| 274 |
+
--target_modules all-linear \
|
| 275 |
+
--gradient_accumulation_steps 16 \
|
| 276 |
+
--eval_steps 50 \
|
| 277 |
+
--save_steps 50 \
|
| 278 |
+
--save_total_limit 2 \
|
| 279 |
+
--logging_steps 5 \
|
| 280 |
+
--max_length 2048 \
|
| 281 |
+
--output_dir output \
|
| 282 |
+
--system 'You are a helpful assistant.' \
|
| 283 |
+
--warmup_ratio 0.05 \
|
| 284 |
+
--dataloader_num_workers 4 \
|
| 285 |
+
--model_author swift \
|
| 286 |
+
--model_name swift-robot
|
| 287 |
+
```
|
| 288 |
+
|
| 289 |
+
Tips:
|
| 290 |
+
|
| 291 |
+
- If you want to train with a custom dataset, you can refer to [this guide](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) to organize your dataset format and specify `--dataset <dataset_path>`.
|
| 292 |
+
- The `--model_author` and `--model_name` parameters are only effective when the dataset includes `swift/self-cognition`.
|
| 293 |
+
- To train with a different model, simply modify `--model <model_id/model_path>`.
|
| 294 |
+
- By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
|
| 295 |
+
|
| 296 |
+
After training is complete, use the following command to infer with the trained weights:
|
| 297 |
+
|
| 298 |
+
- Here, `--adapters` should be replaced with the last checkpoint folder generated during training. Since the adapters folder contains the training parameter file `args.json`, there is no need to specify `--model`, `--system` separately; Swift will automatically read these parameters. To disable this behavior, you can set `--load_args false`.
|
| 299 |
+
|
| 300 |
+
```shell
|
| 301 |
+
# Using an interactive command line for inference.
|
| 302 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 303 |
+
swift infer \
|
| 304 |
+
--adapters output/vx-xxx/checkpoint-xxx \
|
| 305 |
+
--stream true \
|
| 306 |
+
--temperature 0 \
|
| 307 |
+
--max_new_tokens 2048
|
| 308 |
+
|
| 309 |
+
# merge-lora and use vLLM for inference acceleration
|
| 310 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 311 |
+
swift infer \
|
| 312 |
+
--adapters output/vx-xxx/checkpoint-xxx \
|
| 313 |
+
--stream true \
|
| 314 |
+
--merge_lora true \
|
| 315 |
+
--infer_backend vllm \
|
| 316 |
+
--max_model_len 8192 \
|
| 317 |
+
--temperature 0 \
|
| 318 |
+
--max_new_tokens 2048
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
Finally, use the following command to push the model to ModelScope:
|
| 322 |
+
|
| 323 |
+
```shell
|
| 324 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 325 |
+
swift export \
|
| 326 |
+
--adapters output/vx-xxx/checkpoint-xxx \
|
| 327 |
+
--push_to_hub true \
|
| 328 |
+
--hub_model_id '<your-model-id>' \
|
| 329 |
+
--hub_token '<your-sdk-token>' \
|
| 330 |
+
--use_hf false
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
### Web-UI
|
| 335 |
+
The Web-UI is a **zero-threshold** training and deployment interface solution based on Gradio interface technology. For more details, you can check [here](https://swift.readthedocs.io/en/latest/GetStarted/Web-UI.html).
|
| 336 |
+
|
| 337 |
+
```shell
|
| 338 |
+
SWIFT_UI_LANG=en swift web-ui
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+

|
| 342 |
+
|
| 343 |
+
### Using Python
|
| 344 |
+
|
| 345 |
+
ms-swift also supports training and inference using Python. Below is pseudocode for training and inference. For more details, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb).
|
| 346 |
+
|
| 347 |
+
Training:
|
| 348 |
+
|
| 349 |
+
```python
|
| 350 |
+
# Retrieve the model and template, and add a trainable LoRA module
|
| 351 |
+
model, tokenizer = get_model_tokenizer(model_id_or_path, ...)
|
| 352 |
+
template = get_template(model.model_meta.template, tokenizer, ...)
|
| 353 |
+
model = Swift.prepare_model(model, lora_config)
|
| 354 |
+
|
| 355 |
+
# Download and load the dataset, and encode the text into tokens
|
| 356 |
+
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
|
| 357 |
+
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
|
| 358 |
+
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
|
| 359 |
+
|
| 360 |
+
# Train the model
|
| 361 |
+
trainer = Seq2SeqTrainer(
|
| 362 |
+
model=model,
|
| 363 |
+
args=training_args,
|
| 364 |
+
data_collator=template.data_collator,
|
| 365 |
+
train_dataset=train_dataset,
|
| 366 |
+
eval_dataset=val_dataset,
|
| 367 |
+
template=template,
|
| 368 |
+
)
|
| 369 |
+
trainer.train()
|
| 370 |
+
```
|
| 371 |
+
Inference:
|
| 372 |
+
|
| 373 |
+
```python
|
| 374 |
+
# Perform inference using the native PyTorch engine
|
| 375 |
+
engine = PtEngine(model_id_or_path, adapters=[lora_checkpoint])
|
| 376 |
+
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
|
| 377 |
+
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
|
| 378 |
+
|
| 379 |
+
resp_list = engine.infer([infer_request], request_config)
|
| 380 |
+
print(f'response: {resp_list[0].choices[0].message.content}')
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
## ✨ Usage
|
| 384 |
+
Here is a minimal example of training to deployment using ms-swift. For more details, you can check the [examples](https://github.com/modelscope/ms-swift/tree/main/examples).
|
| 385 |
+
|
| 386 |
+
- If you want to use other models or datasets (including multimodal models and datasets), you only need to modify `--model` to specify the corresponding model's ID or path, and modify `--dataset` to specify the corresponding dataset's ID or path.
|
| 387 |
+
- By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
|
| 388 |
+
|
| 389 |
+
| Useful Links |
|
| 390 |
+
| ------ |
|
| 391 |
+
| [🔥Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html) |
|
| 392 |
+
| [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) |
|
| 393 |
+
| [Custom Models](https://swift.readthedocs.io/en/latest/Customization/Custom-model.html), [🔥Custom Datasets](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) |
|
| 394 |
+
| [LLM Tutorial](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
|
| 395 |
+
|
| 396 |
+
### Training
|
| 397 |
+
|
| 398 |
+
Supported Training Methods:
|
| 399 |
+
|
| 400 |
+
| Method | Full-Parameter | LoRA | QLoRA | Deepspeed | Multi-Node | Multi-Modal |
|
| 401 |
+
|------------------------------------|--------------------------------------------------------------|---------------------------------------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|----------------------------------------------------------------------------------------------|
|
| 402 |
+
| Pre-training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh) | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 403 |
+
| Instruction Supervised Fine-tuning | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
|
| 404 |
+
| DPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo.sh) |
|
| 405 |
+
| GRPO Training                      | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/grpo_zero2.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_node) | ✅ |
|
| 406 |
+
| Reward Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ |
|
| 407 |
+
| PPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | ❌ |
|
| 408 |
+
| KTO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
|
| 409 |
+
| CPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ |
|
| 410 |
+
| SimPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ |
|
| 411 |
+
| ORPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ |
|
| 412 |
+
| Classification Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_5/sft.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_vl/sft.sh) |
|
| 413 |
+
| Embedding Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gte.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh) |
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
Pre-training:
|
| 418 |
+
```shell
|
| 419 |
+
# 8*A100
|
| 420 |
+
NPROC_PER_NODE=8 \
|
| 421 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
| 422 |
+
swift pt \
|
| 423 |
+
--model Qwen/Qwen2.5-7B \
|
| 424 |
+
--dataset swift/chinese-c4 \
|
| 425 |
+
--streaming true \
|
| 426 |
+
--train_type full \
|
| 427 |
+
--deepspeed zero2 \
|
| 428 |
+
--output_dir output \
|
| 429 |
+
--max_steps 10000 \
|
| 430 |
+
...
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
Fine-tuning:
|
| 434 |
+
```shell
|
| 435 |
+
CUDA_VISIBLE_DEVICES=0 swift sft \
|
| 436 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 437 |
+
--dataset AI-ModelScope/alpaca-gpt4-data-en \
|
| 438 |
+
--train_type lora \
|
| 439 |
+
--output_dir output \
|
| 440 |
+
...
|
| 441 |
+
```
|
| 442 |
+
|
| 443 |
+
RLHF:
|
| 444 |
+
```shell
|
| 445 |
+
CUDA_VISIBLE_DEVICES=0 swift rlhf \
|
| 446 |
+
--rlhf_type dpo \
|
| 447 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 448 |
+
--dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
|
| 449 |
+
--train_type lora \
|
| 450 |
+
--output_dir output \
|
| 451 |
+
...
|
| 452 |
+
```
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
### Inference
|
| 456 |
+
```shell
|
| 457 |
+
CUDA_VISIBLE_DEVICES=0 swift infer \
|
| 458 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 459 |
+
--stream true \
|
| 460 |
+
--infer_backend pt \
|
| 461 |
+
--max_new_tokens 2048
|
| 462 |
+
|
| 463 |
+
# LoRA
|
| 464 |
+
CUDA_VISIBLE_DEVICES=0 swift infer \
|
| 465 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 466 |
+
--adapters swift/test_lora \
|
| 467 |
+
--stream true \
|
| 468 |
+
--infer_backend pt \
|
| 469 |
+
--temperature 0 \
|
| 470 |
+
--max_new_tokens 2048
|
| 471 |
+
```
|
| 472 |
+
|
| 473 |
+
### Interface Inference
|
| 474 |
+
```shell
|
| 475 |
+
CUDA_VISIBLE_DEVICES=0 swift app \
|
| 476 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 477 |
+
--stream true \
|
| 478 |
+
--infer_backend pt \
|
| 479 |
+
--max_new_tokens 2048
|
| 480 |
+
```
|
| 481 |
+
|
| 482 |
+
### Deployment
|
| 483 |
+
```shell
|
| 484 |
+
CUDA_VISIBLE_DEVICES=0 swift deploy \
|
| 485 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 486 |
+
--infer_backend vllm
|
| 487 |
+
```
|
| 488 |
+
|
| 489 |
+
### Sampling
|
| 490 |
+
```shell
|
| 491 |
+
CUDA_VISIBLE_DEVICES=0 swift sample \
|
| 492 |
+
--model LLM-Research/Meta-Llama-3.1-8B-Instruct \
|
| 493 |
+
--sampler_engine pt \
|
| 494 |
+
--num_return_sequences 5 \
|
| 495 |
+
--dataset AI-ModelScope/alpaca-gpt4-data-zh#5
|
| 496 |
+
```
|
| 497 |
+
|
| 498 |
+
### Evaluation
|
| 499 |
+
```shell
|
| 500 |
+
CUDA_VISIBLE_DEVICES=0 swift eval \
|
| 501 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 502 |
+
--infer_backend lmdeploy \
|
| 503 |
+
--eval_backend OpenCompass \
|
| 504 |
+
--eval_dataset ARC_c
|
| 505 |
+
```
|
| 506 |
+
|
| 507 |
+
### Quantization
|
| 508 |
+
```shell
|
| 509 |
+
CUDA_VISIBLE_DEVICES=0 swift export \
|
| 510 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 511 |
+
--quant_bits 4 --quant_method awq \
|
| 512 |
+
--dataset AI-ModelScope/alpaca-gpt4-data-zh \
|
| 513 |
+
--output_dir Qwen2.5-7B-Instruct-AWQ
|
| 514 |
+
```
|
| 515 |
+
|
| 516 |
+
### Push Model
|
| 517 |
+
```shell
|
| 518 |
+
swift export \
|
| 519 |
+
--model <model-path> \
|
| 520 |
+
--push_to_hub true \
|
| 521 |
+
--hub_model_id '<model-id>' \
|
| 522 |
+
--hub_token '<sdk-token>'
|
| 523 |
+
```
|
| 524 |
+
|
| 525 |
+
## 🏛 License
|
| 526 |
+
|
| 527 |
+
This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). For models and datasets, please refer to the original resource page and follow the corresponding License.
|
| 528 |
+
|
| 529 |
+
## 📎 Citation
|
| 530 |
+
|
| 531 |
+
```bibtex
|
| 532 |
+
@misc{zhao2024swiftascalablelightweightinfrastructure,
|
| 533 |
+
title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
|
| 534 |
+
author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
|
| 535 |
+
year={2024},
|
| 536 |
+
eprint={2408.05517},
|
| 537 |
+
archivePrefix={arXiv},
|
| 538 |
+
primaryClass={cs.CL},
|
| 539 |
+
url={https://arxiv.org/abs/2408.05517},
|
| 540 |
+
}
|
| 541 |
+
```
|
| 542 |
+
|
| 543 |
+
## Star History
|
| 544 |
+
|
| 545 |
+
[](https://star-history.com/#modelscope/ms-swift&Date)
|
ms-swift/ms_swift.egg-info/not-zip-safe
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
ms-swift/requirements/install_all.sh
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# please use python=3.10, cuda12.*
|
| 2 |
+
# sh requirements/install_all.sh
|
| 3 |
+
pip install "vllm>=0.5.1" -U
|
| 4 |
+
pip install "lmdeploy>=0.5" -U --no-deps
|
| 5 |
+
pip install autoawq -U --no-deps
|
| 6 |
+
pip install auto_gptq optimum bitsandbytes -U
|
| 7 |
+
pip install git+https://github.com/modelscope/ms-swift.git
|
| 8 |
+
pip install timm -U
|
| 9 |
+
pip install deepspeed -U
|
| 10 |
+
pip install qwen_vl_utils qwen_omni_utils decord librosa pyav icecream soundfile -U
|
| 11 |
+
pip install liger_kernel nvitop pre-commit -U
|
| 12 |
+
# flash-attn: https://github.com/Dao-AILab/flash-attention/releases
|
ms-swift/requirements/seq_parallel.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
xtuner
|
ms-swift/requirements/swanlab.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
swanlab
|
ms-swift/scripts/benchmark/config/tuner.json
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cmd": "sft",
|
| 3 |
+
"requirements":{
|
| 4 |
+
"gpu": "1",
|
| 5 |
+
"ddp": "1"
|
| 6 |
+
},
|
| 7 |
+
"eval_requirements": {
|
| 8 |
+
"gpu": "1"
|
| 9 |
+
},
|
| 10 |
+
"eval_dataset": ["ceval", "gsm8k", "arc"],
|
| 11 |
+
"args": {
|
| 12 |
+
"model": "Qwen/Qwen-7B-Chat",
|
| 13 |
+
"dataset": "iic/ms_agent",
|
| 14 |
+
"per_device_train_batch_size": 1,
|
| 15 |
+
"max_length": 2048,
|
| 16 |
+
"loss_scale": "react",
|
| 17 |
+
"gradient_accumulation_steps": 16,
|
| 18 |
+
"learning_rate": 5e-5,
|
| 19 |
+
"attn_impl": "flash_attn",
|
| 20 |
+
"eval_steps": 2000,
|
| 21 |
+
"save_steps": 2000,
|
| 22 |
+
"num_train_epochs": 2,
|
| 23 |
+
"gradient_checkpointing": true,
|
| 24 |
+
"weight_decay": 0.01,
|
| 25 |
+
"warmup_ratio": 0.03,
|
| 26 |
+
"save_total_limit": 2,
|
| 27 |
+
"logging_steps": 10
|
| 28 |
+
},
|
| 29 |
+
"experiment": [
|
| 30 |
+
{
|
| 31 |
+
"name": "lora",
|
| 32 |
+
"args": {
|
| 33 |
+
"train_type": "lora",
|
| 34 |
+
"lora_rank": 8,
|
| 35 |
+
"lora_alpha": 32
|
| 36 |
+
}
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"name": "lora+packing",
|
| 40 |
+
"args": {
|
| 41 |
+
"train_type": "lora",
|
| 42 |
+
"lora_rank": 8,
|
| 43 |
+
"lora_alpha": 32,
|
| 44 |
+
"packing": true,
|
| 45 |
+
"eval_steps": 200,
|
| 46 |
+
"save_steps": 200
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"name": "lora+packing+ddp",
|
| 51 |
+
"requirements":{
|
| 52 |
+
"gpu": "2",
|
| 53 |
+
"ddp": "2"
|
| 54 |
+
},
|
| 55 |
+
"args": {
|
| 56 |
+
"train_type": "lora",
|
| 57 |
+
"lora_rank": 8,
|
| 58 |
+
"lora_alpha": 32,
|
| 59 |
+
"packing": true,
|
| 60 |
+
"eval_steps": 100,
|
| 61 |
+
"save_steps": 100
|
| 62 |
+
}
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"name": "lora+packing+lazytokenize",
|
| 66 |
+
"args": {
|
| 67 |
+
"train_type": "lora",
|
| 68 |
+
"lora_rank": 8,
|
| 69 |
+
"lora_alpha": 32,
|
| 70 |
+
"packing": true,
|
| 71 |
+
"lazy_tokenize": true,
|
| 72 |
+
"eval_steps": 200,
|
| 73 |
+
"save_steps": 200
|
| 74 |
+
}
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "lora+",
|
| 78 |
+
"args": {
|
| 79 |
+
"train_type": "lora",
|
| 80 |
+
"lora_rank": 8,
|
| 81 |
+
"lora_alpha": 32,
|
| 82 |
+
"lorap_lr_ratio": 16.0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "rslora",
|
| 87 |
+
"args": {
|
| 88 |
+
"train_type": "lora",
|
| 89 |
+
"lora_rank": 8,
|
| 90 |
+
"lora_alpha": 32,
|
| 91 |
+
"use_rslora": true
|
| 92 |
+
}
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"name": "dora",
|
| 96 |
+
"args": {
|
| 97 |
+
"train_type": "lora",
|
| 98 |
+
"lora_rank": 8,
|
| 99 |
+
"lora_alpha": 32,
|
| 100 |
+
"use_dora": true
|
| 101 |
+
}
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"name": "lora+neftune",
|
| 105 |
+
"args": {
|
| 106 |
+
"train_type": "lora",
|
| 107 |
+
"lora_rank": 8,
|
| 108 |
+
"lora_alpha": 32,
|
| 109 |
+
"neftune_noise_alpha": 15.0
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"name": "llamapro",
|
| 114 |
+
"args": {
|
| 115 |
+
"train_type": "llamapro",
|
| 116 |
+
"llamapro_num_new_blocks": "4"
|
| 117 |
+
}
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"name": "full",
|
| 121 |
+
"requirements":{
|
| 122 |
+
"gpu": "1",
|
| 123 |
+
"ddp": "1"
|
| 124 |
+
},
|
| 125 |
+
"args": {
|
| 126 |
+
"train_type": "full"
|
| 127 |
+
}
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"name": "reft",
|
| 131 |
+
"requirements":{
|
| 132 |
+
"gpu": "1",
|
| 133 |
+
"ddp": "1"
|
| 134 |
+
},
|
| 135 |
+
"args": {
|
| 136 |
+
"train_type": "reft",
|
| 137 |
+
"gradient_checkpointing": "false",
|
| 138 |
+
"loss_scale": "default"
|
| 139 |
+
}
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"name": "full+galore128+quantize",
|
| 143 |
+
"requirements":{
|
| 144 |
+
"gpu": "1",
|
| 145 |
+
"ddp": "1"
|
| 146 |
+
},
|
| 147 |
+
"args": {
|
| 148 |
+
"train_type": "full",
|
| 149 |
+
"use_galore": "true",
|
| 150 |
+
"galore_rank": "128",
|
| 151 |
+
"galore_update_proj_gap": "200",
|
| 152 |
+
"galore_optim_per_parameter": "false",
|
| 153 |
+
"galore_with_embedding": "false",
|
| 154 |
+
"galore_quantization": "true"
|
| 155 |
+
}
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"name": "full+galore128+quantize+proj_quant",
|
| 159 |
+
"requirements":{
|
| 160 |
+
"gpu": "1",
|
| 161 |
+
"ddp": "1"
|
| 162 |
+
},
|
| 163 |
+
"args": {
|
| 164 |
+
"train_type": "full",
|
| 165 |
+
"use_galore": "true",
|
| 166 |
+
"galore_rank": "128",
|
| 167 |
+
"galore_update_proj_gap": "200",
|
| 168 |
+
"galore_optim_per_parameter": "false",
|
| 169 |
+
"galore_with_embedding": "false",
|
| 170 |
+
"galore_quantization": "true",
|
| 171 |
+
"galore_proj_quant": "true"
|
| 172 |
+
}
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"name": "full+galore128",
|
| 176 |
+
"requirements":{
|
| 177 |
+
"gpu": "1",
|
| 178 |
+
"ddp": "1"
|
| 179 |
+
},
|
| 180 |
+
"args": {
|
| 181 |
+
"train_type": "full",
|
| 182 |
+
"use_galore": "true",
|
| 183 |
+
"galore_rank": "128",
|
| 184 |
+
"galore_update_proj_gap": "200",
|
| 185 |
+
"galore_optim_per_parameter": "false",
|
| 186 |
+
"galore_with_embedding": "false"
|
| 187 |
+
}
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"name": "full+galore64",
|
| 191 |
+
"requirements":{
|
| 192 |
+
"gpu": "1",
|
| 193 |
+
"ddp": "1"
|
| 194 |
+
},
|
| 195 |
+
"args": {
|
| 196 |
+
"train_type": "full",
|
| 197 |
+
"use_galore": "true",
|
| 198 |
+
"galore_rank": "64",
|
| 199 |
+
"galore_update_proj_gap": "200",
|
| 200 |
+
"galore_optim_per_parameter": "false",
|
| 201 |
+
"galore_with_embedding": "false"
|
| 202 |
+
}
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"name": "full+galore32",
|
| 206 |
+
"requirements":{
|
| 207 |
+
"gpu": "1",
|
| 208 |
+
"ddp": "1"
|
| 209 |
+
},
|
| 210 |
+
"args": {
|
| 211 |
+
"train_type": "full",
|
| 212 |
+
"use_galore": "true",
|
| 213 |
+
"galore_rank": "32",
|
| 214 |
+
"galore_update_proj_gap": "200",
|
| 215 |
+
"galore_optim_per_parameter": "false",
|
| 216 |
+
"galore_with_embedding": "false"
|
| 217 |
+
}
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"name": "full+galore_emb",
|
| 221 |
+
"requirements":{
|
| 222 |
+
"gpu": "1",
|
| 223 |
+
"ddp": "1"
|
| 224 |
+
},
|
| 225 |
+
"args": {
|
| 226 |
+
"train_type": "full",
|
| 227 |
+
"use_galore": "true",
|
| 228 |
+
"galore_rank": "128",
|
| 229 |
+
"galore_update_proj_gap": "200",
|
| 230 |
+
"galore_optim_per_parameter": "false",
|
| 231 |
+
"galore_with_embedding": "true"
|
| 232 |
+
}
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"name": "full+galore_perparam",
|
| 236 |
+
"requirements":{
|
| 237 |
+
"gpu": "1",
|
| 238 |
+
"ddp": "1"
|
| 239 |
+
},
|
| 240 |
+
"args": {
|
| 241 |
+
"train_type": "full",
|
| 242 |
+
"use_galore": "true",
|
| 243 |
+
"galore_rank": "128",
|
| 244 |
+
"galore_update_proj_gap": "200",
|
| 245 |
+
"galore_optim_per_parameter": "true",
|
| 246 |
+
"galore_with_embedding": "false"
|
| 247 |
+
}
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"name": "adalora",
|
| 251 |
+
"args": {
|
| 252 |
+
"train_type": "adalora",
|
| 253 |
+
"lora_rank": 8,
|
| 254 |
+
"lora_alpha": 32
|
| 255 |
+
}
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"name": "adapter",
|
| 259 |
+
"args": {
|
| 260 |
+
"train_type": "adapter"
|
| 261 |
+
}
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"name": "full+lisa_2",
|
| 265 |
+
"info": "lisa 2layers + full",
|
| 266 |
+
"args": {
|
| 267 |
+
"train_type": "full",
|
| 268 |
+
"lisa_activated_layers": 2,
|
| 269 |
+
"lisa_step_interval": 20
|
| 270 |
+
}
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"name": "full+lisa_4",
|
| 274 |
+
"info": "lisa 4layers + full",
|
| 275 |
+
"args": {
|
| 276 |
+
"train_type": "full",
|
| 277 |
+
"lisa_activated_layers": 4,
|
| 278 |
+
"lisa_step_interval": 20
|
| 279 |
+
}
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"name": "unsloth+lora+q4",
|
| 283 |
+
"info": "unsloth lora quantization bit 4",
|
| 284 |
+
"args": {
|
| 285 |
+
"train_type": "lora",
|
| 286 |
+
"tuner_backend": "unsloth",
|
| 287 |
+
"quantization_bit": 4,
|
| 288 |
+
"model": "LLM-Research/Meta-Llama-3-8B-Instruct"
|
| 289 |
+
}
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"name": "unsloth+full",
|
| 293 |
+
"info": "unsloth full",
|
| 294 |
+
"args": {
|
| 295 |
+
"train_type": "full",
|
| 296 |
+
"tuner_backend": "unsloth",
|
| 297 |
+
"model_type": "LLM-Research/Meta-Llama-3-8B-Instruct"
|
| 298 |
+
}
|
| 299 |
+
}
|
| 300 |
+
]
|
| 301 |
+
}
|
ms-swift/scripts/benchmark/exp.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import os.path
|
| 5 |
+
|
| 6 |
+
from exp_utils import ExpManager, find_all_config
|
| 7 |
+
|
| 8 |
+
from swift.utils import *
|
| 9 |
+
|
| 10 |
+
logger = get_logger()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def parse_args():
    """Parse command-line arguments for swift benchmark experiments.

    Returns:
        argparse.Namespace with:
            config (str): comma-separated experiment config files and/or
                directories (required).
            save_dir (str): folder where experiment outputs are written.
    """
    parser = argparse.ArgumentParser(description='Simple args for swift experiments.')
    parser.add_argument(
        '--config',
        type=str,
        # NOTE: `default=None` was removed — it is dead alongside
        # `required=True` (argparse never falls back to the default
        # for a required option).
        required=True,
        help='The experiment config file',
    )
    parser.add_argument(
        '--save_dir',
        type=str,
        default='./experiment',
        required=False,
        help='The experiment output folder',
    )

    args = parser.parse_args()
    return args
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def llm_exp():
    """Entry point: collect experiment configs and run them via ExpManager.

    Reads `--config` (comma-separated files/dirs) and `--save_dir` from the
    command line, expands each entry into concrete config files, then hands
    the whole batch to ExpManager.
    """
    args = parse_args()
    config: str = args.config
    # `--config` is a comma-separated string of files and/or directories.
    # str.split always returns a list, so the previous
    # `if not isinstance(config, list)` guard was dead code and is removed.
    config_list = config.split(',')
    os.makedirs(args.save_dir, exist_ok=True)
    all_configs = []
    for dir_or_file in config_list:
        # find_all_config expands a directory into the configs it contains.
        all_configs.extend(find_all_config(dir_or_file))
    args.config = all_configs
    exp_manager = ExpManager()
    exp_manager.begin(args)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
if __name__ == '__main__':
|
| 50 |
+
llm_exp()
|
ms-swift/scripts/benchmark/generate_report.py
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import dataclasses
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from swift.llm.template import split_str_parts_by
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class ModelOutput:
    """Parsed record of one benchmark experiment run (sft/eval/export)."""

    # Experiment group this run belongs to (used to cluster report tables).
    group: str = None

    # Unique experiment name, shown as the first report column.
    name: str = None

    # swift sub-command that produced this record, e.g. 'sft' / 'eval' / 'export'.
    cmd: str = None

    # Resource requirements declared in the experiment config (e.g. gpu/ddp).
    requirements: Dict[str, str] = dataclasses.field(default_factory=dict)

    # Raw CLI arguments the experiment was launched with.
    args: Dict[str, Any] = dataclasses.field(default_factory=dict)

    # Human-readable GPU memory usage string (joined across devices).
    memory: str = None

    # Total training wall time in seconds.
    train_time: float = None

    # Number of training samples consumed.
    train_samples: int = None

    # Training throughput reported by the trainer.
    train_samples_per_second: float = None

    last_model_checkpoint: str = None

    best_model_checkpoint: str = None

    # Best evaluation metric observed during training (lower is better here).
    best_metric: Any = None

    global_step: int = None

    # Parameter counts are in millions (parsed from the model_info string).
    num_total_parameters: float = None

    num_trainable_parameters: float = None

    num_buffers: float = None

    # Percentage string such as '0.2958%'.
    trainable_parameters_percentage: float = None

    train_dataset_info: str = None

    val_dataset_info: str = None

    # Unix timestamp (seconds) when the experiment record was created.
    train_create_time: float = None

    # Tokens generated during evaluation (used to compute infer speed).
    eval_tokens: int = None

    eval_time: float = None

    # Eval report entries, each like {'name': 'gsm8k', 'score': ...}.
    reports: Dict[str, Any] = None

    train_loss: float = None

    @property
    def tuner_hyper_params(self):
        """Render the tuner-specific hyper-parameters as a '/'-joined string.

        Returns '' when no sft_type is recorded or nothing applies; any
        trailing '/' is stripped.
        """
        hyper_params = ''
        args = self.args
        if 'sft_type' not in args:
            return ''
        if args['sft_type'] in ('lora', 'adalora', 'longlora'):
            if 'lora_rank' in args:
                hyper_params += f'rank={args["lora_rank"]}/' \
                                f'target={args["lora_target_modules"]}/' \
                                f'alpha={args["lora_alpha"]}/' \
                                f'lr_ratio={args.get("lora_lr_ratio", None)}/' \
                                f'use_rslora={args.get("use_rslora", False)}/' \
                                f'use_dora={args.get("use_dora", False)}'
            else:
                hyper_params = ''
        if args['sft_type'] == 'full':
            # Galore details only apply to full-parameter training runs.
            if 'use_galore' in args and args['use_galore'] == 'true':
                hyper_params += f'galore_rank={args["galore_rank"]}/' \
                                f'galore_per_parameter={args["galore_optim_per_parameter"]}/' \
                                f'galore_with_embedding={args["galore_with_embedding"]}/'
        if args['sft_type'] == 'llamapro':
            hyper_params += f'num_blocks={args["llamapro_num_new_blocks"]}/'
        if 'neftune_noise_alpha' in args and args['neftune_noise_alpha']:
            hyper_params += f'neftune_noise_alpha={args["neftune_noise_alpha"]}/'

        if hyper_params.endswith('/'):
            hyper_params = hyper_params[:-1]
        return hyper_params

    @property
    def hyper_parameters(self):
        """Generic training hyper-parameters (learning rate and epochs)."""
        if 'learning_rate' not in self.args:
            return ''
        return f'lr={self.args["learning_rate"]}/' \
               f'epoch={self.args["num_train_epochs"]}'

    @property
    def train_speed(self):
        """Training throughput cell, e.g. '12.34(1000 samples/81.00 seconds)'."""
        if self.train_samples_per_second:
            return f'{self.train_samples_per_second:.2f}({self.train_samples} samples/{self.train_time:.2f} seconds)'
        else:
            return ''

    @property
    def infer_speed(self):
        """Inference throughput cell in tokens/second; '' when no eval ran."""
        if self.eval_tokens:
            return f'{self.eval_tokens / self.eval_time:.2f}({self.eval_tokens} tokens/{self.eval_time:.2f} seconds)'
        return ''
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def generate_sft_report(outputs: List[ModelOutput]):
    """Render a markdown table summarizing sft/eval experiment runs.

    The per-column best value (lowest eval metric / train loss, highest
    gsm8k/arc/ceval accuracy) is emphasized in bold; missing values render
    as empty cells.

    Args:
        outputs: Parsed experiment records with cmd in ('sft', 'eval').

    Returns:
        The complete markdown table as one string.
    """

    def _extract_accs(output):
        # Pull the gsm8k/arc/ceval scores (if present) out of an eval report.
        scores = {'gsm8k': None, 'arc': None, 'ceval': None}
        for report in (output.reports or []):
            if report['name'] in scores:
                scores[report['name']] = report['score']
        return scores['gsm8k'], scores['arc'], scores['ceval']

    def _fmt(value, is_best, precision):
        # Format a numeric cell; bold it when it is the column's best value.
        # Falsy values (None/0) render as an empty cell, as before.
        if not value:
            return ''
        text = f'{value:.{precision}f}'
        return f'**{text}**' if is_best else text

    gsm8k_accs, arc_accs, ceval_accs = [], [], []
    for output in outputs:
        gsm8k_acc, arc_acc, ceval_acc = _extract_accs(output)
        gsm8k_accs.append(gsm8k_acc)
        arc_accs.append(arc_acc)
        ceval_accs.append(ceval_acc)

    tab = '| exp_name | model_type | dataset | ms-bench mix ratio | tuner | tuner_params | trainable params(M) | flash_attn | gradient_checkpointing | hypers | memory | train speed(samples/s) | infer speed(tokens/s) | train_loss | eval_loss | gsm8k weighted acc | arc weighted acc | ceval weighted acc |\n' \
          '| -------- | ---------- | ------- | -------------------| ----- | ------------ | ------------------- | -----------| ---------------------- | ------ | ------ | ---------------------- | --------------------- | ---------- | --------- | ------------------ | ---------------- | ------------------ |\n'  # noqa
    # 999. is a sentinel larger than any realistic loss/metric so that
    # missing values never win the "best (minimum)" comparison.
    min_best_metric = 999.
    min_train_loss = 999.
    if outputs:
        min_best_metric = min(output.best_metric or 999. for output in outputs)
        min_train_loss = min(output.train_loss or 999. for output in outputs)

    # `default=0.0` covers the empty-outputs case.
    max_gsm8k = max((acc or 0. for acc in gsm8k_accs), default=0.0)
    max_arc = max((acc or 0. for acc in arc_accs), default=0.0)
    max_ceval = max((acc or 0. for acc in ceval_accs), default=0.0)

    for output, gsm8k_acc, arc_acc, ceval_acc in zip(outputs, gsm8k_accs, arc_accs, ceval_accs):
        best_metric = _fmt(output.best_metric, np.isclose(min_best_metric, output.best_metric or 999.0), 2)
        train_loss = _fmt(output.train_loss, np.isclose(min_train_loss, output.train_loss or 999.0), 2)
        gsm8k_cell = _fmt(gsm8k_acc, np.isclose(max_gsm8k, gsm8k_acc or 0.0), 3)
        arc_cell = _fmt(arc_acc, np.isclose(max_arc, arc_acc or 0.0), 3)
        ceval_cell = _fmt(ceval_acc, np.isclose(max_ceval, ceval_acc or 0.0), 3)

        # BUGFIX: the row previously emitted best_metric before train_loss,
        # which put train_loss under the 'eval_loss' header and best_metric
        # under 'train_loss'. Emit train_loss first to match the header order.
        line = f'|{output.name}|' \
               f'{output.args["model_type"]}|' \
               f'{output.args.get("dataset")}|' \
               f'{output.args.get("train_dataset_mix_ratio", 0.)}|' \
               f'{output.args.get("sft_type")}|' \
               f'{output.tuner_hyper_params}|' \
               f'{output.num_trainable_parameters}({output.trainable_parameters_percentage})|' \
               f'{output.args.get("use_flash_attn", "")}|' \
               f'{output.args.get("gradient_checkpointing", "")}|' \
               f'{output.hyper_parameters}|' \
               f'{output.memory}|' \
               f'{output.train_speed}|' \
               f'{output.infer_speed}|' \
               f'{train_loss}|' \
               f'{best_metric}|' \
               f'{gsm8k_cell}|' \
               f'{arc_cell}|' \
               f'{ceval_cell}|\n'
        tab += line
    return tab
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def generate_export_report(outputs: List[ModelOutput]):
    """Render a markdown table summarizing quantization-export experiment runs.

    The best gsm8k/arc/ceval accuracy per column is emphasized in bold;
    missing values render as empty cells.

    Args:
        outputs: Parsed experiment records with cmd == 'export'.

    Returns:
        The complete markdown table as one string.
    """
    tab = '| exp_name | model_type | calibration dataset | quantization method | quantization bits | infer speed(tokens/s) | gsm8k weighted acc | arc weighted acc | ceval weighted acc |\n' \
          '| -------- | ---------- | ------------------- | ------------------- | ----------------- | --------------------- | ------------------ | ---------------- | ------------------ |\n'  # noqa

    def _fmt_acc(value, is_best):
        # Format an accuracy cell; bold it when it is the column's best value.
        # Falsy values (None/0) render as an empty cell, as before.
        if not value:
            return ''
        text = f'{value:.3f}'
        return f'**{text}**' if is_best else text

    gsm8k_accs = []
    arc_accs = []
    ceval_accs = []
    for output in outputs:
        gsm8k_acc = None
        arc_acc = None
        ceval_acc = None
        for report in (output.reports or []):
            if report['name'] == 'gsm8k':
                gsm8k_acc = report['score']
            if report['name'] == 'arc':
                arc_acc = report['score']
            if report['name'] == 'ceval':
                ceval_acc = report['score']
        gsm8k_accs.append(gsm8k_acc)
        arc_accs.append(arc_acc)
        ceval_accs.append(ceval_acc)

    # `default=0.0` covers the empty-outputs case.
    max_gsm8k = max((acc or 0. for acc in gsm8k_accs), default=0.0)
    max_arc = max((acc or 0. for acc in arc_accs), default=0.0)
    max_ceval = max((acc or 0. for acc in ceval_accs), default=0.0)

    for output, gsm8k_acc, arc_acc, ceval_acc in zip(outputs, gsm8k_accs, arc_accs, ceval_accs):
        gsm8k_cell = _fmt_acc(gsm8k_acc, np.isclose(max_gsm8k, gsm8k_acc or 0.0))
        arc_cell = _fmt_acc(arc_acc, np.isclose(max_arc, arc_acc or 0.0))
        ceval_cell = _fmt_acc(ceval_acc, np.isclose(max_ceval, ceval_acc or 0.0))

        # The calibration dataset cell optionally appends dataset statistics.
        if output.train_dataset_info:
            dataset_info = f'{output.args["dataset"]}/{output.train_dataset_info}'
        else:
            dataset_info = f'{output.args["dataset"]}'
        line = f'|{output.name}|' \
               f'{output.args["model_type"]}|' \
               f'{dataset_info}|' \
               f'{output.args["quant_method"]}|' \
               f'{output.args["quant_bits"]}|' \
               f'{output.infer_speed}|' \
               f'{gsm8k_cell}|' \
               f'{arc_cell}|' \
               f'{ceval_cell}|\n'
        tab += line
    return tab
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def parse_output(file):
    """Load one experiment result JSON file and convert it to a ModelOutput.

    The file carries top-level metadata (name/group/cmd/requirements/args)
    plus a 'record' payload whose shape depends on the sub-command: export
    runs only have checkpoint + eval info, while sft runs carry the full
    training statistics. Missing record fields stay None.
    """
    with open(file, 'r', encoding='utf-8') as f:
        content = json.load(f)

    name = content['name']
    group = content['group']
    cmd = content['cmd']
    requirements = content['requirements']
    args = content['args']
    create_time = float(content.get('create_time') or 0)
    # From here on `content` refers to the nested record payload.
    content = content['record']
    if cmd == 'export':
        best_model_checkpoint = content['best_model_checkpoint']
        eval_tokens = 0
        eval_time = 0.0
        eval_result = None
        if 'eval_result' in content:
            eval_result = content['eval_result']
            eval_tokens = eval_result['generation_info']['tokens']
            eval_time = eval_result['generation_info']['time']
            eval_result = eval_result['report']
        return ModelOutput(
            group=group,
            name=name,
            cmd=cmd,
            requirements=requirements,
            args=args,
            best_model_checkpoint=best_model_checkpoint,
            eval_time=eval_time,
            eval_tokens=eval_tokens,
            reports=eval_result,
        )
    else:
        # Training record: every field is optional, so default them all
        # to None and fill in whatever the record actually contains.
        memory = None
        train_time = None
        train_samples = None
        train_samples_per_second = None
        last_model_checkpoint = None
        best_model_checkpoint = None
        best_metric = None
        global_step = None
        train_dataset_info = None
        val_dataset_info = None
        num_trainable_parameters = None
        num_buffers = None
        trainable_parameters_percentage = None
        num_total_parameters = None
        train_loss = None
        if 'memory' in content:
            memory = content['memory']
            # Join per-device memory strings into a single report cell.
            memory = '/'.join(memory.values())
        if 'train_time' in content:
            train_time = content['train_time']['train_runtime']
            train_samples = content['train_time']['n_train_samples']
            train_samples_per_second = content['train_time']['train_samples_per_second']
        if 'last_model_checkpoint' in content:
            last_model_checkpoint = content['last_model_checkpoint']
        if 'best_model_checkpoint' in content:
            best_model_checkpoint = content['best_model_checkpoint']
        if 'best_metric' in content:
            best_metric = content['best_metric']
        if 'log_history' in content:
            # The final log entry carries the aggregated train_loss.
            train_loss = content['log_history'][-1]['train_loss']
        if 'global_step' in content:
            global_step = content['global_step']
        if 'dataset_info' in content:
            train_dataset_info = content['dataset_info'].get('train_dataset')
            val_dataset_info = content['dataset_info'].get('val_dataset')
        if 'model_info' in content:
            # model_info like: SwiftModel: 6758.4041M Params (19.9885M Trainable [0.2958%]), 16.7793M Buffers.
            # split_str_parts_by slices the string at the delimiters below so
            # the numeric fields can be pulled out by the delimiter that
            # precedes them.
            str_dict = split_str_parts_by(content['model_info'], [
                'SwiftModel:', 'CausalLM:', 'Seq2SeqLM:', 'LMHeadModel:', 'M Params (', 'M Trainable [', ']), ',
                'M Buffers.'
            ])
            str_dict = {c['key']: c['content'] for c in str_dict}
            # The total-parameter count follows whichever model-class prefix
            # the model_info string happened to start with.
            if 'SwiftModel:' in str_dict:
                num_total_parameters = float(str_dict['SwiftModel:'])
            elif 'CausalLM:' in str_dict:
                num_total_parameters = float(str_dict['CausalLM:'])
            elif 'Seq2SeqLM:' in str_dict:
                num_total_parameters = float(str_dict['Seq2SeqLM:'])
            elif 'LMHeadModel:' in str_dict:
                num_total_parameters = float(str_dict['LMHeadModel:'])
            num_trainable_parameters = float(str_dict['M Params ('])
            num_buffers = float(str_dict[']), '])
            # Kept as the raw percentage string (e.g. '0.2958%').
            trainable_parameters_percentage = str_dict['M Trainable [']

        eval_tokens = 0
        eval_time = 0.0
        eval_result = None
        if 'eval_result' in content:
            eval_result = content['eval_result']
            eval_tokens = eval_result['generation_info']['tokens']
            eval_time = eval_result['generation_info']['time']
            eval_result = eval_result['report']

        return ModelOutput(
            group=group,
            name=name,
            cmd=cmd,
            requirements=requirements,
            args=args,
            memory=memory,
            train_time=train_time,
            train_samples=train_samples,
            train_samples_per_second=train_samples_per_second,
            last_model_checkpoint=last_model_checkpoint,
            best_model_checkpoint=best_model_checkpoint,
            best_metric=best_metric,
            global_step=global_step,
            train_dataset_info=train_dataset_info,
            val_dataset_info=val_dataset_info,
            train_create_time=create_time,
            num_total_parameters=num_total_parameters,
            num_trainable_parameters=num_trainable_parameters,
            num_buffers=num_buffers,
            trainable_parameters_percentage=trainable_parameters_percentage,
            eval_time=eval_time,
            eval_tokens=eval_tokens,
            reports=eval_result,
            train_loss=train_loss,
        )
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def generate_reports():
    """Scan ./experiment for result JSON files and print per-group report tables."""
    parsed = []
    for root, _, filenames in os.walk('./experiment'):
        for filename in filenames:
            path = os.path.join(root, filename)
            # Only real result files; skip notebook checkpoints.
            if path.endswith('.json') and 'ipynb' not in path:
                parsed.append(parse_output(path))

    for grp in {record.group for record in parsed}:
        members = [record for record in parsed if record.group == grp]
        print(f'=================Printing the sft cmd result of exp {grp}==================\n\n')
        print(generate_sft_report([record for record in members if record.cmd in ('sft', 'eval')]))
        print(f'=================Printing the export cmd result of exp {grp}==================\n\n')
        print(generate_export_report([record for record in members if record.cmd == 'export']))
        print('=================Printing done==================\n\n')
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
if __name__ == '__main__':
|
| 433 |
+
generate_reports()
|
ms-swift/scripts/utils/run_dataset_info.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from swift.llm import DATASET_MAPPING, EncodePreprocessor, get_model_tokenizer, get_template, load_dataset
|
| 7 |
+
from swift.utils import stat_array
|
| 8 |
+
|
| 9 |
+
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_cache_mapping(fpath):
    """Parse the previously generated dataset markdown table in *fpath*.

    Returns a mapping of dataset id -> (dataset_size, token statistics),
    so already-measured datasets can be skipped on the next run.
    """
    with open(fpath, 'r', encoding='utf-8') as handle:
        document = handle.read()
    # Keep only the markdown table, then drop its header and separator rows.
    table = document[document.find('| Dataset ID |'):]
    mapping = {}
    for row in table.split('\n')[2:]:
        if not row:
            continue
        cells = row.split('|')
        # Prefer the ModelScope id cell; fall back to the HF id cell when absent.
        id_cell = cells[1] if cells[1] != '-' else cells[6]
        dataset_id = re.search(r'\[(.+?)\]', id_cell).group(1)
        stat = cells[3:5]
        if stat[0] == '-':
            # '-' in the size column marks a dataset too large to measure.
            stat = ('huge dataset', '-')
        mapping[dataset_id] = stat
    return mapping
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def get_dataset_id(key):
    """Return the first non-None id from *key* (a (ms_id, hf_id, ...) tuple).

    When every entry is None, the last entry (None) is returned.
    """
    selected = None
    for selected in key:
        if selected is not None:
            break
    return selected
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def run_dataset(key, template, cache_mapping):
    """Produce one markdown table row describing the dataset identified by *key*.

    key: a (ms_id, hf_id, ...) tuple as used by DATASET_MAPPING.
    template: the chat template used to tokenize samples for the statistics.
    cache_mapping: dataset_id -> (size, stat) from a previous run; hit entries
        skip the expensive re-tokenization.
    """
    ms_id, hf_id, _ = key
    dataset_meta = DATASET_MAPPING[key]
    tags = ', '.join(tag for tag in dataset_meta.tags) or '-'
    # Prefer the ModelScope id; only load from HF when no ms id exists.
    dataset_id = ms_id or hf_id
    use_hf = ms_id is None
    if ms_id is not None:
        ms_id = f'[{ms_id}](https://modelscope.cn/datasets/{ms_id})'
    else:
        ms_id = '-'
    if hf_id is not None:
        hf_id = f'[{hf_id}](https://huggingface.co/datasets/{hf_id})'
    else:
        hf_id = '-'
    subsets = '<br>'.join(subset.name for subset in dataset_meta.subsets)

    if dataset_meta.huge_dataset:
        # Too large to download/measure; emit placeholder cells.
        dataset_size = 'huge dataset'
        stat_str = '-'
    elif dataset_id in cache_mapping:
        # Reuse the size/statistics measured on a previous run.
        dataset_size, stat_str = cache_mapping[dataset_id]
    else:
        num_proc = 4
        dataset, _ = load_dataset(f'{dataset_id}:all', strict=False, num_proc=num_proc, use_hf=use_hf)
        dataset_size = len(dataset)
        # Fixed seed so the sampled subset (and thus the statistics) is
        # reproducible across runs; cap the sample at 100k rows.
        random_state = np.random.RandomState(42)
        idx_list = random_state.choice(dataset_size, size=min(dataset_size, 100000), replace=False)
        encoded_dataset = EncodePreprocessor(template)(dataset.select(idx_list), num_proc=num_proc)

        input_ids = encoded_dataset['input_ids']
        token_len = [len(tokens) for tokens in input_ids]
        stat = stat_array(token_len)[0]
        stat_str = f"{stat['mean']:.1f}±{stat['std']:.1f}, min={stat['min']}, max={stat['max']}"

    return f'|{ms_id}|{subsets}|{dataset_size}|{stat_str}|{tags}|{hf_id}|'
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def write_dataset_info() -> None:
    """Regenerate the dataset table in both the zh and en documentation files.

    Reuses statistics cached in the existing zh table, tokenizes any new
    datasets with the Qwen2.5 template, and rewrites everything after the
    '| Dataset ID |' marker in each doc file.
    """
    fpaths = ['docs/source/Instruction/支持的模型和数据集.md', 'docs/source_en/Instruction/Supported-models-and-datasets.md']
    # The zh doc is the cache source; both docs receive the regenerated table.
    cache_mapping = get_cache_mapping(fpaths[0])
    res_text_list = []
    res_text_list.append('| Dataset ID | Subset Name | Dataset Size | Statistic (token) | Tags | HF Dataset ID |')
    res_text_list.append('| ---------- | ----------- | -------------| ------------------| ---- | ------------- |')

    all_keys = list(DATASET_MAPPING.keys())
    all_keys = sorted(all_keys, key=lambda x: get_dataset_id(x))
    # Tokenizer only — no model weights are loaded for the statistics.
    _, tokenizer = get_model_tokenizer('Qwen/Qwen2.5-7B-Instruct', load_model=False)
    template = get_template(tokenizer.model_meta.template, tokenizer)
    try:
        for i, key in enumerate(all_keys):
            res = run_dataset(key, template, cache_mapping)
            res_text_list.append(res)
            print(res)
    finally:
        # Write out whatever was produced even if a dataset failed midway,
        # so partial progress (and the cache) is not lost.
        for fpath in fpaths:
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.read()
            idx = text.find('| Dataset ID |')

            new_text = '\n'.join(res_text_list)
            # Keep everything before the table marker; replace the table.
            text = text[:idx] + new_text + '\n'
            with open(fpath, 'w', encoding='utf-8') as f:
                f.write(text)
        print(f'数据集总数: {len(all_keys)}')
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
if __name__ == '__main__':
|
| 106 |
+
write_dataset_info()
|
ms-swift/scripts/utils/run_template.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from swift.llm import TemplateType
|
| 2 |
+
|
| 3 |
+
if __name__ == '__main__':
    # Partition every registered template name by whether it is a
    # text-generation template or a chat template, then print both lists.
    names = TemplateType.get_template_name_list()
    generation_names = ', '.join(name for name in names if 'generation' in name)
    chat_names = ', '.join(name for name in names if 'generation' not in name)
    print(f'Text Generation: {generation_names}')
    print(f'Chat: {chat_names}')
|
ms-swift/swift/__init__.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from typing import TYPE_CHECKING
|
| 3 |
+
|
| 4 |
+
from .utils.import_utils import _LazyModule
|
| 5 |
+
|
| 6 |
+
# Lazy-import bootstrap: static type checkers see the real imports, while at
# runtime the module is replaced by a _LazyModule that imports submodules
# only on first attribute access (keeps `import swift` fast).
if TYPE_CHECKING:
    from .version import __version__, __release_datetime__
    from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, SWIFT_MAPPING,
                         AdaLoraConfig, LoftQConfig, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, PeftConfig,
                         PeftModel, PeftModelForCausalLM, ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM,
                         PeftModelForSequenceClassification, PeftModelForTokenClassification, PrefixTuningConfig,
                         PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, get_peft_config, get_peft_model,
                         get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift,
                         SwiftTuners, LongLoRAConfig, LongLoRA, LongLoRAModelType, SCETuning, SCETuningConfig)
    from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy,
                           SchedulerType, ShardedDDPOption, TrainingArguments, Seq2SeqTrainingArguments, Trainer,
                           Seq2SeqTrainer)
    from .utils import get_logger
else:
    # Maps submodule name -> names exported from it. This structure must be
    # kept in sync with the TYPE_CHECKING imports above.
    _import_structure = {
        'version': ['__release_datetime__', '__version__'],
        'tuners': [
            'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING',
            'LoraConfig', 'AdaLoraConfig', 'LoftQConfig', 'LoHaConfig', 'LoKrConfig', 'OFTConfig', 'PeftConfig',
            'ResTuningConfig', 'SideConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM',
            'PeftModelForSequenceClassification', 'PeftModelForTokenClassification', 'PrefixTuningConfig',
            'PromptEncoderConfig', 'PromptLearningConfig', 'PromptTuningConfig', 'get_peft_config', 'get_peft_model',
            'get_peft_model_state_dict', 'Prompt', 'PromptConfig', 'PromptModule', 'SwiftConfig', 'SwiftOutput',
            'Swift', 'SwiftTuners', 'LongLoRAConfig', 'LongLoRA', 'LongLoRAModelType', 'SCETuning', 'SCETuningConfig'
        ],
        'trainers': [
            'EvaluationStrategy',
            'FSDPOption',
            'HPSearchBackend',
            'HubStrategy',
            'IntervalStrategy',
            'SchedulerType',
            'ShardedDDPOption',
            'TrainingArguments',
            'Seq2SeqTrainingArguments',
            'Trainer',
            'Seq2SeqTrainer',
        ],
        'utils': ['get_logger']
    }

    import sys

    # Replace this module object in sys.modules with the lazy proxy.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
|
ms-swift/swift/cli/__init__.py
ADDED
|
File without changes
|
ms-swift/swift/cli/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
ms-swift/swift/cli/_megatron/pt.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from swift.megatron import megatron_pt_main
|
| 2 |
+
|
| 3 |
+
if __name__ == '__main__':
|
| 4 |
+
megatron_pt_main()
|
ms-swift/swift/cli/_megatron/sft.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from swift.megatron import megatron_sft_main
|
| 2 |
+
|
| 3 |
+
if __name__ == '__main__':
|
| 4 |
+
megatron_sft_main()
|
ms-swift/swift/cli/app.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from swift.llm import app_main
|
| 2 |
+
|
| 3 |
+
if __name__ == '__main__':
|
| 4 |
+
app_main()
|
ms-swift/swift/cli/eval.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import eval_main
|
| 3 |
+
|
| 4 |
+
if __name__ == '__main__':
|
| 5 |
+
eval_main()
|
ms-swift/swift/cli/export.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import export_main
|
| 3 |
+
|
| 4 |
+
if __name__ == '__main__':
|
| 5 |
+
export_main()
|
ms-swift/swift/cli/main.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import importlib.util
|
| 3 |
+
import os
|
| 4 |
+
import subprocess
|
| 5 |
+
import sys
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from swift.utils import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger()
|
| 11 |
+
|
| 12 |
+
# Maps each `swift <subcommand>` name to the cli module implementing it.
# Every module path is the command name with '-' normalized to '_'.
ROUTE_MAPPING: Dict[str, str] = {
    cmd: "swift.cli.{}".format(cmd.replace('-', '_'))
    for cmd in ('pt', 'sft', 'infer', 'merge-lora', 'web-ui', 'deploy',
                'rollout', 'rlhf', 'sample', 'export', 'eval', 'app')
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def use_torchrun() -> bool:
    """Return True when the environment requests a torchrun launch.

    Torchrun is used iff at least one of NPROC_PER_NODE / NNODES is set.
    """
    return not (os.getenv('NPROC_PER_NODE') is None and os.getenv('NNODES') is None)


def get_torchrun_args() -> Optional[List[str]]:
    """Collect torchrun CLI flags from the environment.

    Returns None when torchrun is not requested; otherwise a flat
    ['--flag', 'value', ...] list containing one pair per launch
    env var that is actually set.
    """
    if not use_torchrun():
        return
    flags: List[str] = []
    for env_key in ('NPROC_PER_NODE', 'MASTER_PORT', 'NNODES', 'NODE_RANK', 'MASTER_ADDR'):
        env_val = os.getenv(env_key)
        if env_val is not None:
            flags += [f'--{env_key.lower()}', env_val]
    return flags
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _compat_web_ui(argv):
|
| 49 |
+
# [compat]
|
| 50 |
+
method_name = argv[0]
|
| 51 |
+
if method_name in {'web-ui', 'web_ui'} and ('--model' in argv or '--adapters' in argv or '--ckpt_dir' in argv):
|
| 52 |
+
argv[0] = 'app'
|
| 53 |
+
logger.warning('Please use `swift app`.')
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def cli_main(route_mapping: Optional[Dict[str, str]] = None) -> None:
    """Dispatch `swift <cmd> ...` to the matching cli module in a subprocess.

    Distributed-capable subcommands (pt/sft/rlhf/infer) are launched via
    `torch.distributed.run` when torchrun env vars are set. A non-zero
    exit code from the child is propagated to the caller.
    """
    route_mapping = route_mapping or ROUTE_MAPPING
    argv = sys.argv[1:]
    _compat_web_ui(argv)
    # Normalize 'web_ui' style names to the dashed form used in the mapping.
    method_name, argv = argv[0].replace('_', '-'), argv[1:]
    file_path = importlib.util.find_spec(route_mapping[method_name]).origin
    torchrun_args = get_torchrun_args()
    python_cmd = sys.executable
    plain_launch = torchrun_args is None or method_name not in {'pt', 'sft', 'rlhf', 'infer'}
    if plain_launch:
        args = [python_cmd, file_path, *argv]
    else:
        args = [python_cmd, '-m', 'torch.distributed.run', *torchrun_args, file_path, *argv]
    print(f"run sh: `{' '.join(args)}`", flush=True)
    result = subprocess.run(args)
    if result.returncode != 0:
        sys.exit(result.returncode)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == '__main__':
|
| 76 |
+
cli_main()
|
ms-swift/swift/cli/pt.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import pt_main
|
| 3 |
+
|
| 4 |
+
if __name__ == '__main__':
|
| 5 |
+
pt_main()
|
ms-swift/swift/cli/rollout.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import rollout_main
|
| 3 |
+
|
| 4 |
+
if __name__ == '__main__':
|
| 5 |
+
rollout_main()
|
ms-swift/swift/hub/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (231 Bytes). View file
|
|
|
ms-swift/swift/hub/__pycache__/hub.cpython-310.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
ms-swift/swift/llm/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (3.92 kB). View file
|
|
|
ms-swift/swift/llm/__pycache__/data_loader.cpython-310.pyc
ADDED
|
Binary file (4.21 kB). View file
|
|
|
ms-swift/swift/llm/app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .app import SwiftApp, app_main
|
ms-swift/swift/llm/argument/app_args.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from typing import Literal, Optional
|
| 4 |
+
|
| 5 |
+
from swift.utils import find_free_port, get_logger
|
| 6 |
+
from ..model import get_matched_model_meta
|
| 7 |
+
from ..template import get_template_meta
|
| 8 |
+
from .deploy_args import DeployArguments
|
| 9 |
+
from .webui_args import WebUIArguments
|
| 10 |
+
|
| 11 |
+
logger = get_logger()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class AppArguments(WebUIArguments, DeployArguments):
    """Arguments for the `swift app` chat application.

    Combines web-UI options with deployment options. When `base_url`
    points at an already-deployed service, local model loading (and
    dtype resolution) is skipped.
    """

    # URL of an external inference service; if set, no local model is loaded.
    base_url: Optional[str] = None
    # Title shown on the studio page.
    studio_title: Optional[str] = None
    # Whether the chat UI accepts multimodal input; inferred from the
    # matched model meta when left as None, finally defaulting to False.
    is_multimodal: Optional[bool] = None

    # UI language.
    lang: Literal['en', 'zh'] = 'en'
    verbose: bool = False

    def _init_torch_dtype(self) -> None:
        # With an external service there is no local model to configure:
        # only resolve the model meta (for template/system lookup) and return.
        if self.base_url:
            self.model_meta = get_matched_model_meta(self.model)
            return
        super()._init_torch_dtype()

    def __post_init__(self):
        super().__post_init__()
        # Fall forward to a free port if the requested one is taken.
        self.server_port = find_free_port(self.server_port)
        if self.model_meta:
            if self.system is None:
                # Use the template's default system prompt when none was given.
                self.system = get_template_meta(self.model_meta.template).default_system
            if self.is_multimodal is None:
                self.is_multimodal = self.model_meta.is_multimodal
        # NOTE(review): reconstructed placement — this final fallback appears
        # to apply when no model meta resolved; confirm against upstream.
        if self.is_multimodal is None:
            self.is_multimodal = False
|
ms-swift/swift/llm/data_loader.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.distributed as dist
|
| 5 |
+
from torch.utils.data import DataLoader
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class BatchSamplerShard:
    """Per-rank batch sampler over an evenly sharded index space.

    The global sample count is floor-divided across ranks; when shuffling,
    every rank permutes the same global ordering (same seed) and takes its
    own contiguous slice, so shards stay disjoint.
    """

    def __init__(self, total_samples: int, batch_size: int, shuffle: bool, drop_last: bool, data_seed: Optional[int]):
        # Per-rank share of the dataset (remainder samples are dropped).
        self.total_samples = total_samples // self.world_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.base_seed = data_seed or 0
        self.curr_seed = self.base_seed

    @property
    def rank(self):
        return dist.get_rank() if dist.is_initialized() else 0

    @property
    def world_size(self):
        return dist.get_world_size() if dist.is_initialized() else 1

    def __iter__(self):
        shard_start = self.rank * self.total_samples
        if self.shuffle:
            gen = torch.Generator()
            gen.manual_seed(self.curr_seed)
            # Same permutation on every rank; each rank takes its own slice.
            perm = torch.randperm(self.total_samples * self.world_size, generator=gen).tolist()
            shard = perm[shard_start:shard_start + self.total_samples]
        else:
            shard = list(range(shard_start, shard_start + self.total_samples))

        batch = []
        for idx in shard:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []
        # Emit the trailing partial batch only when drop_last is off.
        if not self.drop_last and batch:
            yield batch

    def set_epoch(self, epoch: int):
        # Reseed per epoch so each epoch yields a different permutation.
        self.curr_seed = self.base_seed + epoch

    def __len__(self) -> int:
        if self.drop_last:
            return self.total_samples // self.batch_size
        return (self.total_samples + self.batch_size - 1) // self.batch_size
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class DataLoaderShard(DataLoader):
    """DataLoader wrapper exposing `set_epoch` on the loader itself.

    Trainers call `set_epoch` on the dataloader; this forwards it to the
    sharded batch sampler so per-epoch shuffling stays in sync across ranks.
    """

    def __init__(self, dataset, batch_sampler: BatchSamplerShard, **dataloader_params):
        self.batch_sampler = batch_sampler
        super().__init__(dataset, batch_sampler=self.batch_sampler, **dataloader_params)

    def set_epoch(self, epoch: int):
        # Delegates to BatchSamplerShard.set_epoch (reseeds the permutation).
        self.batch_sampler.set_epoch(epoch)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class DataLoaderDispatcher:
    """Rank 0 reads from the base dataloader and scatters one item per rank.

    Non-zero ranks only receive. Iteration stops on every rank once the
    base iterator is exhausted: a None sentinel is scattered to all ranks.
    In a non-distributed run this degenerates to plain iteration.
    """

    def __init__(self, base_dataloader):
        self.base_dataloader = base_dataloader

    @property
    def group(self):
        return dist.group.WORLD if dist.is_initialized() else 1

    @property
    def rank(self):
        return dist.get_rank(self.group) if dist.is_initialized() else 0

    @property
    def world_size(self):
        return dist.get_world_size(self.group) if dist.is_initialized() else 1

    def _scatter_object_list(self, inputs):
        # Single-process fallback: just hand back rank 0's slot.
        if not dist.is_initialized():
            return inputs[0]
        outputs = [None]
        global_src_rank = dist.get_global_rank(self.group, 0)
        dist.scatter_object_list(outputs, inputs, global_src_rank, group=self.group)
        return outputs[0]

    def __iter__(self):
        base_iter = iter(self.base_dataloader)
        while True:
            if self.rank == 0:
                try:
                    data = [next(base_iter) for _ in range(self.world_size)]
                except StopIteration:
                    # Sentinel telling every rank to stop.
                    data = [None] * self.world_size
                data = self._scatter_object_list(data)
            else:
                data = self._scatter_object_list(None)
            if data is None:
                break
            yield data
|
ms-swift/swift/llm/dataset/dataset/__pycache__/mllm.cpython-310.pyc
ADDED
|
Binary file (36 kB). View file
|
|
|
ms-swift/swift/llm/dataset/dataset/llm.py
ADDED
|
@@ -0,0 +1,856 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import ast
|
| 3 |
+
import re
|
| 4 |
+
from functools import partial
|
| 5 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from ...template import split_str_parts_by
|
| 11 |
+
from ..preprocessor import (AlpacaPreprocessor, ClsGenerationPreprocessor, ClsPreprocessor, MessagesPreprocessor,
|
| 12 |
+
ResponsePreprocessor, RowPreprocessor, TextGenerationPreprocessor)
|
| 13 |
+
from ..register import DatasetMeta, SubsetDataset, register_dataset
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AlpacaZhPreprocessor(AlpacaPreprocessor):
    """Alpaca-zh variant: strips the Chinese '输入:' prefix from the input field."""

    @classmethod
    def concat_inst_input(cls, instruction, input_):
        prefix = '输入:'
        if input_ and input_.startswith(prefix):
            input_ = input_[len(prefix):]
        return super().concat_inst_input(instruction, input_)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
register_dataset(
|
| 26 |
+
DatasetMeta(
|
| 27 |
+
ms_dataset_id='AI-ModelScope/alpaca-gpt4-data-zh',
|
| 28 |
+
hf_dataset_id='llm-wizard/alpaca-gpt4-data-zh',
|
| 29 |
+
preprocess_func=AlpacaZhPreprocessor(),
|
| 30 |
+
tags=['chat', 'general', '🔥'],
|
| 31 |
+
))
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class LongAlpacaPreprocessor(AlpacaPreprocessor):
    """LongAlpaca variant: drops a leading 'Answer: ' marker from the response."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        prefix = 'Answer: '
        response = row['response']
        if response and response.startswith(prefix):
            response = response[len(prefix):].strip()
        row['output'] = response
        return super().preprocess(row)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
register_dataset(
|
| 46 |
+
DatasetMeta(
|
| 47 |
+
ms_dataset_id='AI-ModelScope/LongAlpaca-12k',
|
| 48 |
+
hf_dataset_id='Yukang/LongAlpaca-12k',
|
| 49 |
+
preprocess_func=LongAlpacaPreprocessor(),
|
| 50 |
+
tags=['long-sequence', 'QA'],
|
| 51 |
+
))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class RuozhibaPreprocessor(RowPreprocessor):
    """Build a pretrain sample from a ruozhiba post title (+ optional abstract)."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Prefer the title; fall back to the post content.
        title = row['content'] if row.get('title', None) is None else row['title']
        abstract = row['abs'] if 'abs' in row else None
        if abstract and abstract != title:
            title = title + ',' + abstract

        # Strip a leading ordinal such as '1.' / '2、' left over from scraping.
        match = re.search(r'\d+[\.,\s,\、](.+)', title)
        if match:
            title = match.group(1)
        if title:
            return {'messages': [{'role': 'assistant', 'content': title}]}
        return None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
register_dataset(
|
| 71 |
+
DatasetMeta(
|
| 72 |
+
ms_dataset_id='AI-ModelScope/ruozhiba',
|
| 73 |
+
subsets=['post-annual', 'title-good', 'title-norm'],
|
| 74 |
+
preprocess_func=RuozhibaPreprocessor(),
|
| 75 |
+
tags=['pretrain', '🔥']))
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class MathTrnPreprocessor(ResponsePreprocessor):
    """Pass query/response through, discarding any other columns."""

    def preprocess(self, row):
        return super().preprocess({'query': row['query'], 'response': row['response']})
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
register_dataset(
|
| 91 |
+
DatasetMeta(ms_dataset_id='AI-ModelScope/math-trn-format', preprocess_func=MathTrnPreprocessor(), tags=['math']))
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _repair_ms_bench(messages: str) -> Optional[List[Dict[str, str]]]:
|
| 95 |
+
if isinstance(messages, str):
|
| 96 |
+
messages = ast.literal_eval(messages)
|
| 97 |
+
default_system = 'You are a helpful assistant.'
|
| 98 |
+
messages: List[Dict[str, str]]
|
| 99 |
+
if messages[0]['from'] == 'system' and messages[0]['value'] == default_system:
|
| 100 |
+
messages.pop(0)
|
| 101 |
+
# skip MOSS
|
| 102 |
+
for c in messages:
|
| 103 |
+
value = c['value'].lower()
|
| 104 |
+
if 'moss' in value or 'human:' in value or 'assistant:' in value or 'user:' in value:
|
| 105 |
+
return
|
| 106 |
+
return messages
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
register_dataset(
|
| 110 |
+
DatasetMeta(
|
| 111 |
+
ms_dataset_id='iic/ms_bench',
|
| 112 |
+
preprocess_func=MessagesPreprocessor(repair_messages=_repair_ms_bench),
|
| 113 |
+
tags=['chat', 'general', 'multi-round', '🔥']))
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _repair_agent_messages(messages: List[Dict[str, str]], use_mini: bool) -> Optional[List[Dict[str, str]]]:
|
| 117 |
+
if use_mini:
|
| 118 |
+
pattern = r'\d\. {"plugin_name": "(.+?)"'
|
| 119 |
+
if messages[0]['from'] != 'system':
|
| 120 |
+
return
|
| 121 |
+
system = messages[0]['value']
|
| 122 |
+
find_list = re.findall(pattern, system)
|
| 123 |
+
if len(set(find_list)) <= 1:
|
| 124 |
+
return
|
| 125 |
+
return messages
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
register_dataset(
|
| 129 |
+
DatasetMeta(
|
| 130 |
+
ms_dataset_id='damo/MSAgent-Bench',
|
| 131 |
+
subsets=[
|
| 132 |
+
SubsetDataset(
|
| 133 |
+
preprocess_func=MessagesPreprocessor(repair_messages=partial(_repair_agent_messages, use_mini=False))),
|
| 134 |
+
SubsetDataset(
|
| 135 |
+
name='mini',
|
| 136 |
+
preprocess_func=MessagesPreprocessor(repair_messages=partial(_repair_agent_messages, use_mini=True)),
|
| 137 |
+
is_weak_subset=True)
|
| 138 |
+
],
|
| 139 |
+
split=['train', 'validation'],
|
| 140 |
+
tags=['chat', 'agent', 'multi-round']))
|
| 141 |
+
|
| 142 |
+
advertise_gen_prompt = """Task: Generating advertisements based on keywords.
|
| 143 |
+
Keywords: {{QUERY}}
|
| 144 |
+
Advertisements:"""
|
| 145 |
+
|
| 146 |
+
register_dataset(
|
| 147 |
+
DatasetMeta(
|
| 148 |
+
ms_dataset_id='lvjianjin/AdvertiseGen',
|
| 149 |
+
hf_dataset_id='shibing624/AdvertiseGen',
|
| 150 |
+
preprocess_func=TextGenerationPreprocessor(
|
| 151 |
+
prompt=advertise_gen_prompt, columns={
|
| 152 |
+
'content': 'query',
|
| 153 |
+
'summary': 'response'
|
| 154 |
+
}),
|
| 155 |
+
tags=['text-generation', '🔥'],
|
| 156 |
+
split=['train', 'validation'],
|
| 157 |
+
))
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class FireflyPreprocessor(ResponsePreprocessor):
    """Keep only firefly samples whose task kind is in the allow-list."""

    _firefly_kind_list = {
        'ProseGeneration', 'MRC', 'JinYongGeneration', 'TextCorrection', 'ClassicalChinese', 'BELLE', 'StoryGeneration',
        'Couplet', 'Cot', 'Dictionary', 'Translation', 'Program', 'SentimentAnalyze', 'OpenQA', 'AncientPoem',
        'TextMatching', 'NLI', 'Summary', 'KeywordRecognition', 'ProductDesc', 'LyricGeneration', 'Composition',
        'MusicComment', 'NER'
    }

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Drop (return None) anything outside the curated task kinds.
        if row['kind'] not in self._firefly_kind_list:
            return None
        return super().preprocess(row)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
register_dataset(
|
| 175 |
+
DatasetMeta(
|
| 176 |
+
ms_dataset_id='AI-ModelScope/firefly-train-1.1M',
|
| 177 |
+
hf_dataset_id='YeungNLP/firefly-train-1.1M',
|
| 178 |
+
preprocess_func=FireflyPreprocessor(),
|
| 179 |
+
tags=['chat', 'general'],
|
| 180 |
+
))
|
| 181 |
+
|
| 182 |
+
register_dataset(
|
| 183 |
+
DatasetMeta(
|
| 184 |
+
ms_dataset_id='modelscope/clue',
|
| 185 |
+
hf_dataset_id='clue',
|
| 186 |
+
subsets=['cmnli'],
|
| 187 |
+
preprocess_func=ClsGenerationPreprocessor(['neutral', 'entailment', 'contradiction'],
|
| 188 |
+
task='Natural Language Inference',
|
| 189 |
+
is_pair_seq=True),
|
| 190 |
+
tags=['text-generation', 'classification'],
|
| 191 |
+
split=['train', 'validation'],
|
| 192 |
+
))
|
| 193 |
+
|
| 194 |
+
register_dataset(
|
| 195 |
+
DatasetMeta(
|
| 196 |
+
ms_dataset_id='DAMO_NLP/jd',
|
| 197 |
+
subsets=[
|
| 198 |
+
SubsetDataset(
|
| 199 |
+
'default',
|
| 200 |
+
'default',
|
| 201 |
+
preprocess_func=ClsGenerationPreprocessor(['negative', 'positive'],
|
| 202 |
+
task='Sentiment Classification',
|
| 203 |
+
is_pair_seq=False)),
|
| 204 |
+
SubsetDataset(
|
| 205 |
+
'cls',
|
| 206 |
+
'default',
|
| 207 |
+
preprocess_func=ClsPreprocessor(columns={'sentence': 'query'}),
|
| 208 |
+
),
|
| 209 |
+
],
|
| 210 |
+
tags=['text-generation', 'classification', '🔥'],
|
| 211 |
+
split=['train', 'validation'],
|
| 212 |
+
))
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class SyntheticText2SqlPreprocessor(ResponsePreprocessor):
    """Compose a chain-of-thought text-to-SQL sample from schema, prompt, sql and explanation."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = f"Sql Table information:\n{row['sql_context']}\n{row['sql_prompt']}"
        response = f"Let's think step by step:\n{row['sql_explanation']}\nSo the final sql is:\n{row['sql']}"
        return super().preprocess({'query': query, 'response': response})
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
register_dataset(
|
| 228 |
+
DatasetMeta(
|
| 229 |
+
ms_dataset_id='AI-ModelScope/synthetic_text_to_sql',
|
| 230 |
+
hf_dataset_id='gretelai/synthetic_text_to_sql',
|
| 231 |
+
preprocess_func=SyntheticText2SqlPreprocessor(),
|
| 232 |
+
tags=['nl2sql', 'en']))
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _repair_toolbench(conversations: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
| 236 |
+
assert len(conversations) == 2
|
| 237 |
+
if conversations[1]['from'] in {'caller', 'conclusion'}:
|
| 238 |
+
conversations[1]['from'] = 'assistant'
|
| 239 |
+
return conversations
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
register_dataset(
|
| 243 |
+
DatasetMeta(
|
| 244 |
+
ms_dataset_id='shenweizhou/alpha-umi-toolbench-processed-v2',
|
| 245 |
+
subsets=['backbone', 'caller', 'planner', 'summarizer'],
|
| 246 |
+
preprocess_func=MessagesPreprocessor(repair_messages=_repair_toolbench),
|
| 247 |
+
tags=['chat', 'agent', '🔥'],
|
| 248 |
+
huge_dataset=True))
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
class BlossomMathPreprocessor(ResponsePreprocessor):
    """Append the ground-truth answer to the solution text for blossom-math rows."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        combined = f"{row['output']}\n\nAnswer: {row['answer']}"
        return super().preprocess({'query': row['query'], 'response': combined})
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
register_dataset(
|
| 259 |
+
DatasetMeta(
|
| 260 |
+
ms_dataset_id='AI-ModelScope/blossom-math-v2',
|
| 261 |
+
hf_dataset_id='Azure99/blossom-math-v2',
|
| 262 |
+
preprocess_func=BlossomMathPreprocessor(),
|
| 263 |
+
tags=['chat', 'math', '🔥']))
|
| 264 |
+
|
| 265 |
+
register_dataset(
|
| 266 |
+
DatasetMeta(
|
| 267 |
+
ms_dataset_id='AI-ModelScope/sql-create-context',
|
| 268 |
+
hf_dataset_id='b-mc2/sql-create-context',
|
| 269 |
+
preprocess_func=AlpacaPreprocessor(columns={
|
| 270 |
+
'question': 'instruction',
|
| 271 |
+
'context': 'input',
|
| 272 |
+
'answer': 'output'
|
| 273 |
+
}),
|
| 274 |
+
tags=['chat', 'sql', '🔥']))
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
class TigerBotLawPreprocessor(ResponsePreprocessor):
    """Flatten a law article (type/title/chapters/body) into one pretrain text."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        pieces = [f"{row['type']}\n{row['title']}\n"]
        # Up to three nested chapter levels; skip missing ones.
        for i in range(1, 4):
            chapter = row[f'chapter{i}']
            if chapter is not None:
                pieces.append(f'{chapter}')
        pieces.append(f'{row["response"]}')
        return super().preprocess({'response': ''.join(pieces)})
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
register_dataset(
|
| 293 |
+
DatasetMeta(
|
| 294 |
+
ms_dataset_id='AI-ModelScope/tigerbot-law-plugin',
|
| 295 |
+
hf_dataset_id='TigerResearch/tigerbot-law-plugin',
|
| 296 |
+
preprocess_func=TigerBotLawPreprocessor(),
|
| 297 |
+
tags=['text-generation', 'law', 'pretrained']))
|
| 298 |
+
|
| 299 |
+
register_dataset(
|
| 300 |
+
DatasetMeta(
|
| 301 |
+
ms_dataset_id='codefuse-ai/CodeExercise-Python-27k',
|
| 302 |
+
preprocess_func=MessagesPreprocessor(columns={'chat_rounds': 'messages'}),
|
| 303 |
+
tags=['chat', 'coding', '🔥']))
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
class LeetcodePythonPreprocessor(ResponsePreprocessor):
    """Split a leetcode sample at the code fence: problem → query, code+explanation → response."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        text = row['code_with_problem']
        split_at = text.find('```python')
        problem = text[:split_at]
        # Drop the markdown heading marker from the problem statement.
        if problem.startswith('# '):
            problem = problem[2:]
        code = text[split_at:].strip()
        explanation = row['explanation_only']
        return super().preprocess({'query': problem, 'response': f'{code}\n\n{explanation}'})
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
register_dataset(
|
| 320 |
+
DatasetMeta(
|
| 321 |
+
ms_dataset_id='AI-ModelScope/leetcode-solutions-python',
|
| 322 |
+
preprocess_func=LeetcodePythonPreprocessor(),
|
| 323 |
+
tags=['chat', 'coding', '🔥']))
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class StsbPreprocessor(ResponsePreprocessor):
    """Embedding-style STSB sample: sentence pair plus similarity label."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        return super().preprocess({
            'query': row['sentence1'],
            'response': row['sentence2'],
            'label': row['score'],
        })
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
class StsbGeneratePreprocessor(ResponsePreprocessor):
    """Generation-style STSB: ask the model to emit the similarity score as text."""

    # This prompt is emitted verbatim into training data — do not reword.
    prompt = """Task: Based on the given two sentences, provide a similarity score between 0.0 and 1.0.
Sentence 1: {text1}
Sentence 2: {text2}
Similarity score: """

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        query = self.prompt.format(text1=row['sentence1'], text2=row['sentence2'])
        return super().preprocess({'query': query, 'response': f"{row['score']:.1f}"})
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
class StsbRegressionPreprocessor(StsbGeneratePreprocessor):
    """Regression-style STSB: same prompt, numeric label instead of a text response."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Bypass StsbGeneratePreprocessor.preprocess and call the base class directly.
        query = self.prompt.format(text1=row['sentence1'], text2=row['sentence2'])
        return super(StsbGeneratePreprocessor, self).preprocess({'query': query, 'label': row['score']})
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
register_dataset(
|
| 362 |
+
DatasetMeta(
|
| 363 |
+
ms_dataset_id='sentence-transformers/stsb',
|
| 364 |
+
hf_dataset_id='sentence-transformers/stsb',
|
| 365 |
+
subsets=[
|
| 366 |
+
SubsetDataset('default', preprocess_func=StsbPreprocessor()), # embedding
|
| 367 |
+
SubsetDataset('generate', preprocess_func=StsbGeneratePreprocessor()),
|
| 368 |
+
SubsetDataset('reg', preprocess_func=StsbRegressionPreprocessor()),
|
| 369 |
+
],
|
| 370 |
+
tags=['similarity', '🔥']))
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def _repair_conversations_agent_instruct(s: str) -> List[Dict[str, Any]]:
|
| 374 |
+
s = s.replace('}\n {', '},\n {')
|
| 375 |
+
if isinstance(s, str):
|
| 376 |
+
s = ast.literal_eval(s)
|
| 377 |
+
return s
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
register_dataset(
|
| 381 |
+
DatasetMeta(
|
| 382 |
+
ms_dataset_id='huangjintao/AgentInstruct_copy',
|
| 383 |
+
subsets=['alfworld', 'db', 'kg', 'mind2web', 'os', 'webshop'],
|
| 384 |
+
preprocess_func=MessagesPreprocessor(repair_messages=_repair_conversations_agent_instruct),
|
| 385 |
+
tags=['chat', 'agent', 'multi-round']))
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
class MultiRoleAgentPreprocessor(RowPreprocessor):
    """Convert multi-role chat-room conversations into single-turn
    system/user/assistant message triples."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        res_prompt = '\n\n【注意事项】\n1. 这是聊天室,不要发送私信给任何人\n2. 仅代表你个人说话,不要扮演其他人,只根据对话历史进行回复\n3. 长话短说,不要说太多话,不要超过50字 '
        history_prompt = '\n\n【chat history】'
        conv_prompt = '\n {name}:{content}'
        conv = row['conversations']
        response = conv[-1]['value']
        query = ''
        system = conv[0]['value'] if conv[0]['from'] == 'system' else ''
        if conv[0]['from'] == 'user':
            query = conv[0]['value']
        elif 'next_speakers:' not in system:
            # Fold the intermediate turns into the system prompt as chat history.
            if system and '【注意事项】' not in system:
                system += res_prompt
            turns = [conv_prompt.format(name=c['from'], content=c['value']) for c in conv[1:-1]]
            system = system + history_prompt + ''.join(turns)

        if not query or not response:
            # Drop rows lacking a usable query/response pair.
            return

        return {
            'messages': [
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': query},
                {'role': 'assistant', 'content': response},
            ],
        }
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
register_dataset(
|
| 423 |
+
DatasetMeta(
|
| 424 |
+
ms_dataset_id='iic/MSAgent-MultiRole',
|
| 425 |
+
preprocess_func=MultiRoleAgentPreprocessor(),
|
| 426 |
+
tags=['chat', 'agent', 'multi-round', 'role-play', 'multi-agent']))
|
| 427 |
+
|
| 428 |
+
register_dataset(DatasetMeta(ms_dataset_id='swift/ToolBench', tags=['chat', 'agent', 'multi-round']))
|
| 429 |
+
|
| 430 |
+
register_dataset(
|
| 431 |
+
DatasetMeta(
|
| 432 |
+
ms_dataset_id='tastelikefeet/competition_math',
|
| 433 |
+
subsets=[
|
| 434 |
+
SubsetDataset(
|
| 435 |
+
name='default',
|
| 436 |
+
subset='default',
|
| 437 |
+
split=['train', 'test'],
|
| 438 |
+
),
|
| 439 |
+
],
|
| 440 |
+
tags=['qa', 'math']))
|
| 441 |
+
|
| 442 |
+
register_dataset(DatasetMeta(ms_dataset_id='modelscope/gsm8k', subsets=['main'], split=['train'], tags=['qa', 'math']))
|
| 443 |
+
|
| 444 |
+
register_dataset(
|
| 445 |
+
DatasetMeta(ms_dataset_id='modelscope/MathR', subsets=['default', 'clean'], split=['train'], tags=['qa', 'math']))
|
| 446 |
+
|
| 447 |
+
register_dataset(
|
| 448 |
+
DatasetMeta(ms_dataset_id='modelscope/MathR-32B-Distill', subsets=['data'], split=['train'], tags=['qa', 'math']))
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
class CoundownTaskPreprocessor(ResponsePreprocessor):
    """Build the Countdown-task prompt: form an equation from the given
    numbers that reaches the target value."""

    # NOTE(review): class name looks like a typo of "Countdown" — kept, since
    # renaming would break external references.

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        # The original `response` column holds the target value, not a reply.
        target = row.pop('response', None)
        nums = row['nums']
        query = (f'Using the numbers {nums}, create an equation that equals {target}.\n'
                 'You can use basic arithmetic operations (+, -, *, /) and each number can only be used once.\n'
                 'Show your work in <think> </think> tags. And return the final equation and answer '
                 'in <answer> </answer> tags, for example <answer> (1 + 2) / 3 * 4 = 4 </answer>.')
        row['target'] = target
        row['query'] = query
        return super().preprocess(row)
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
register_dataset(
|
| 465 |
+
DatasetMeta(
|
| 466 |
+
ms_dataset_id='zouxuhong/Countdown-Tasks-3to4',
|
| 467 |
+
subsets=['default'],
|
| 468 |
+
preprocess_func=CoundownTaskPreprocessor(),
|
| 469 |
+
tags=['math']))
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
class HC3Preprocessor(ResponsePreprocessor):
    """Expand one HC3 row into two samples: one with a human answer and one
    with a ChatGPT answer; the response is the source category name."""

    prompt = """Classification Task: Are the following responses from a human or from ChatGPT?
Question: {question}
Answer: {answer}
Category: Human, ChatGPT
Output:"""

    def preprocess(self, row):
        samples = []
        for source in ('Human', 'ChatGPT'):
            # Sample one answer from the matching answer pool of this row.
            answer = self.random_state.choice(row[f'{source.lower()}_answers'])
            query = self.prompt.format(question=row['query'], answer=answer)
            samples.append(super().preprocess({'query': query, 'response': source}))
        return samples
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
class HC3ClsPreprocessor(HC3Preprocessor):
    """Classification variant of HC3: integer label 0 for Human, 1 for ChatGPT."""

    def preprocess(self, row):
        samples = []
        for label, source in enumerate(('Human', 'ChatGPT')):
            answer = self.random_state.choice(row[f'{source.lower()}_answers'])
            query = self.prompt.format(question=row['query'], answer=answer)
            # Bypass HC3Preprocessor.preprocess: here the target is an int
            # class id rather than a response string.
            samples.append(ResponsePreprocessor.preprocess(self, {'query': query, 'label': label}))
        return samples
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
# HC3-Chinese: for every domain build a generation subset plus a `_cls`
# classification subset sharing the same hub subset name.
hc3_subset_names = ['baike', 'open_qa', 'nlpcc_dbqa', 'finance', 'medicine', 'law', 'psychology']
hc3_subsets: List[SubsetDataset] = []
for hc3_subset_name in hc3_subset_names:
    hc3_subsets += [
        SubsetDataset(
            name=hc3_subset_name,
            subset=hc3_subset_name,
            preprocess_func=HC3Preprocessor(),
        ),
        SubsetDataset(
            name=f'{hc3_subset_name}_cls',
            subset=hc3_subset_name,
            preprocess_func=HC3ClsPreprocessor(),
        ),
    ]

register_dataset(
    DatasetMeta(
        ms_dataset_id='simpleai/HC3-Chinese',
        hf_dataset_id='Hello-SimpleAI/HC3-Chinese',
        subsets=hc3_subsets,
        tags=['text-generation', 'classification', '🔥']))
|
| 521 |
+
|
| 522 |
+
# English HC3 mirrors the Chinese registration but only ships two domains.
hc3_subset_names = ['finance', 'medicine']
hc3_subsets: List[SubsetDataset] = []
for hc3_subset_name in hc3_subset_names:
    hc3_subsets += [
        SubsetDataset(
            name=hc3_subset_name,
            subset=hc3_subset_name,
            preprocess_func=HC3Preprocessor(),
        ),
        SubsetDataset(
            name=f'{hc3_subset_name}_cls',
            subset=hc3_subset_name,
            preprocess_func=HC3ClsPreprocessor(),
        ),
    ]

register_dataset(
    DatasetMeta(
        ms_dataset_id='simpleai/HC3',
        hf_dataset_id='Hello-SimpleAI/HC3',
        subsets=hc3_subsets,
        preprocess_func=HC3Preprocessor(),
        tags=['text-generation', 'classification', '🔥']))
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
class DureaderPreprocessor(RowPreprocessor):
    """Build question-generation samples from DuReader_robust-QG rows."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        prompt = """Task: Question Generation
Context: {context}
Answer: {answer}
Question:"""
        # `text1` packs "answer[SEP]context"; split it into the two fields.
        answer, context = row['text1'].split('[SEP]')
        user_turn = {'role': 'user', 'content': prompt.format(context=context, answer=answer)}
        assistant_turn = {'role': 'assistant', 'content': row['text2']}
        return {'messages': [user_turn, assistant_turn]}
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
register_dataset(
|
| 567 |
+
DatasetMeta(
|
| 568 |
+
ms_dataset_id='modelscope/DuReader_robust-QG',
|
| 569 |
+
preprocess_func=DureaderPreprocessor(),
|
| 570 |
+
split=['train', 'validation', 'test'],
|
| 571 |
+
tags=['text-generation', '🔥']))
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
class HHRLHFPreprocessor(RowPreprocessor):
    """Parse Anthropic hh-rlhf chosen/rejected transcripts into alternating
    user/assistant message lists."""

    @staticmethod
    def _to_messages(data):
        # `data` alternates user text, assistant text, user text, ...
        messages = []
        for user_text, assistant_text in zip(data[::2], data[1::2]):
            messages.append({'role': 'user', 'content': user_text})
            messages.append({'role': 'assistant', 'content': assistant_text})
        return messages

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        pattern = '\n\nHuman:|\n\nAssistant:|\n\nHum:'
        parts_chosen = [s.strip() for s in re.split(pattern, row['chosen'].strip())]
        parts_rejected = [s.strip() for s in re.split(pattern, row['rejected'].strip())]
        if parts_chosen[0].startswith('Human:'):
            assert parts_rejected[0].startswith('Human:')
            # Strip the leading 'Human:' tag (6 chars) from the first turn.
            parts_chosen[0] = parts_chosen[0][6:].strip()
            parts_rejected[0] = parts_rejected[0][6:].strip()
        row['messages'] = self._to_messages(parts_chosen)
        row['rejected_messages'] = self._to_messages(parts_rejected)
        return row
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
# TODO meta file broken
|
| 599 |
+
register_dataset(
|
| 600 |
+
DatasetMeta(
|
| 601 |
+
ms_dataset_id='AI-ModelScope/hh-rlhf',
|
| 602 |
+
subsets=['helpful-base', 'helpful-online', 'helpful-rejection-sampled'],
|
| 603 |
+
preprocess_func=HHRLHFPreprocessor(),
|
| 604 |
+
split=['train', 'test'],
|
| 605 |
+
tags=['rlhf', 'dpo'],
|
| 606 |
+
huge_dataset=True))
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
class XlamFunctionCallingPreprocessor(ResponsePreprocessor):
    """Render one randomly chosen tool call from xlam-function-calling as an
    Action/Action-Input response."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        answers = row['response']
        if isinstance(answers, str):
            answers = json.loads(answers)
        # A row may list several candidate calls; sample exactly one.
        answer = np.random.choice(answers)
        name = answer['name']
        args = json.dumps(answer['arguments'])
        response = f'Action: {name}\nAction Input: {args}'
        return super().preprocess({
            'query': row['query'],
            'response': response,
            'solution': response,
            'tools': row['tools'],
        })
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
register_dataset(
|
| 625 |
+
DatasetMeta(
|
| 626 |
+
ms_dataset_id='LLM-Research/xlam-function-calling-60k',
|
| 627 |
+
subsets=['dataset'],
|
| 628 |
+
preprocess_func=XlamFunctionCallingPreprocessor(),
|
| 629 |
+
tags=['agent']))
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
class HHRLHFCNPreprocessor(MessagesPreprocessor):
    """hh_rlhf_cn rows: append the chosen turn to the history and record the
    rejected text before the standard messages preprocessing."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        chosen_turn = row.pop('chosen')
        row['messages'].append(chosen_turn)
        row['rejected_response'] = row['rejected']['text']
        return super().preprocess(row)
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
register_dataset(
|
| 641 |
+
DatasetMeta(
|
| 642 |
+
ms_dataset_id='AI-ModelScope/hh_rlhf_cn',
|
| 643 |
+
subsets=['hh_rlhf', 'harmless_base_cn', 'harmless_base_en', 'helpful_base_cn', 'helpful_base_en'],
|
| 644 |
+
preprocess_func=HHRLHFCNPreprocessor(columns={'context': 'messages'}, content_key='text'),
|
| 645 |
+
split=['train', 'test'],
|
| 646 |
+
tags=['rlhf', 'dpo', '🔥']))
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
def repair_conversations(s: Union[str, Any]) -> Any:
    """Insert commas that are missing between serialized message dicts, then
    literal-eval the string.

    Non-string inputs are assumed to be parsed already and returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # NOTE(review): the last pattern appears identical to the first in the
    # rendered source — kept for behavioral fidelity.
    for broken in ('}\n {', '}\n{', '}{', '}\n {'):
        s = s.replace(broken, '},{')
    return ast.literal_eval(s)
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
register_dataset(
|
| 660 |
+
DatasetMeta(
|
| 661 |
+
ms_dataset_id='AI-ModelScope/lmsys-chat-1m',
|
| 662 |
+
hf_dataset_id='lmsys/lmsys-chat-1m',
|
| 663 |
+
preprocess_func=MessagesPreprocessor(repair_messages=repair_conversations),
|
| 664 |
+
tags=['chat', 'em']))
|
| 665 |
+
|
| 666 |
+
register_dataset(
|
| 667 |
+
DatasetMeta(
|
| 668 |
+
ms_dataset_id='hjh0119/shareAI-Llama3-DPO-zh-en-emoji',
|
| 669 |
+
hf_dataset_id='shareAI/DPO-zh-en-emoji',
|
| 670 |
+
preprocess_func=ResponsePreprocessor(columns={
|
| 671 |
+
'answer_zh': 'response',
|
| 672 |
+
'answer_en': 'rejected_response'
|
| 673 |
+
}),
|
| 674 |
+
tags=['rlhf', 'dpo']))
|
| 675 |
+
|
| 676 |
+
register_dataset(
|
| 677 |
+
DatasetMeta(ms_dataset_id='AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto', tags=['rlhf', 'kto']))
|
| 678 |
+
|
| 679 |
+
register_dataset(
|
| 680 |
+
DatasetMeta(
|
| 681 |
+
ms_dataset_id='OmniData/Zhihu-KOL-More-Than-100-Upvotes',
|
| 682 |
+
hf_dataset_id='bzb2023/Zhihu-KOL-More-Than-100-Upvotes',
|
| 683 |
+
tags=['zhihu', 'qa']))
|
| 684 |
+
|
| 685 |
+
register_dataset(
|
| 686 |
+
DatasetMeta(
|
| 687 |
+
ms_dataset_id='OmniData/Zhihu-KOL',
|
| 688 |
+
hf_dataset_id='wangrui6/Zhihu-KOL',
|
| 689 |
+
huge_dataset=True,
|
| 690 |
+
tags=['zhihu', 'qa'],
|
| 691 |
+
))
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
class GuanacoPreprocessor(RowPreprocessor):
    """Rebuild a multi-turn chat from Guanaco's instruction/input/output fields.

    The `instruction` field embeds earlier turns tagged with speaker labels;
    rows whose turn structure does not alternate user/assistant are dropped
    (the method returns None).
    """

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        instruction = row['instruction']
        input = row['input']  # NOTE: shadows the builtin; kept to preserve code
        output = row['output']
        history = []
        if instruction:
            # Split the embedded transcript on speaker tags; several localized
            # and misspelled "Assistant" variants occur in the data.
            parts = split_str_parts_by(
                instruction, ['User:', 'User:', 'Assistant:', 'Assistant:', 'Asssistent:', 'Assistent:', 'Assistenz:'])
            for idx, part in enumerate(parts):
                if idx % 2 == 0:
                    # Even positions must be user turns; otherwise drop the row.
                    if 'user' not in part['key'].lower():
                        return
                    history.append([part['content'], None])
                else:
                    # Odd positions must be assistant turns (any variant spelling).
                    if 'assist' not in part['key'].lower() and 'asssist' not in part['key'].lower():
                        return
                    history[-1][-1] = part['content']
        if input.startswith('User:'):
            # The final user message may also carry an inline speaker tag.
            input = input[len('User:'):].strip()
        if any([not h[0] or not h[1] for h in history]):
            # Drop rows with an incomplete (empty or unanswered) turn.
            return

        # Flatten the [user, assistant] pairs, then append the final exchange.
        messages = []
        for h in history:
            messages.append({'role': 'user', 'content': h[0]})
            messages.append({'role': 'assistant', 'content': h[1]})
        messages.append({'role': 'user', 'content': input})
        messages.append({'role': 'assistant', 'content': output})
        return {
            'messages': messages,
        }
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
register_dataset(
|
| 730 |
+
DatasetMeta(
|
| 731 |
+
ms_dataset_id='AI-ModelScope/GuanacoDataset',
|
| 732 |
+
hf_dataset_id='JosephusCheung/GuanacoDataset',
|
| 733 |
+
preprocess_func=GuanacoPreprocessor(),
|
| 734 |
+
tags=['chat', 'zh']))
|
| 735 |
+
|
| 736 |
+
|
| 737 |
+
class FunctionCallChatmlPreprocessor(MessagesPreprocessor):
    """Extract tool definitions and drop the leading system turn for the
    function-calling-chatml dataset."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        res = super().preprocess(row)

        if res['function_description']:
            # Tool signatures are blank-line separated inside one text field.
            res['tools'] = res['function_description'].split('\n\n')
        messages = res['messages']
        if messages[0]['role'] == 'system':
            del messages[0]
        return res
|
| 748 |
+
|
| 749 |
+
|
| 750 |
+
register_dataset(
|
| 751 |
+
DatasetMeta(
|
| 752 |
+
ms_dataset_id='AI-ModelScope/function-calling-chatml',
|
| 753 |
+
hf_dataset_id='Locutusque/function-calling-chatml',
|
| 754 |
+
preprocess_func=FunctionCallChatmlPreprocessor(),
|
| 755 |
+
tags=['agent', 'en', 'sft', '🔥']))
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
class Dolly15kPreprocessor(RowPreprocessor):
    """Prefix the optional context onto the instruction and emit one
    user/assistant exchange."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        context = row['context']
        query = ''
        if context:
            # Context, when present, is prepended as supporting information.
            query = 'Here gives some useful information:\n' + context + '\n'
        query += row['instruction']
        return {
            'messages': [
                {'role': 'user', 'content': query},
                {'role': 'assistant', 'content': row['response']},
            ],
        }
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
register_dataset(
|
| 782 |
+
DatasetMeta(
|
| 783 |
+
ms_dataset_id='AI-ModelScope/databricks-dolly-15k',
|
| 784 |
+
hf_dataset_id='databricks/databricks-dolly-15k',
|
| 785 |
+
preprocess_func=Dolly15kPreprocessor(),
|
| 786 |
+
tags=['multi-task', 'en', 'quality']))
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
class OrpoDPOMix40kPreprocessor(MessagesPreprocessor):
    """orpo-dpo-mix-40k: drop every row from the toxic-dpo-v0.2 source."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        if row['source'] == 'toxic-dpo-v0.2':
            return None
        return super().preprocess(row)
|
| 795 |
+
|
| 796 |
+
|
| 797 |
+
register_dataset(
|
| 798 |
+
DatasetMeta(
|
| 799 |
+
ms_dataset_id='AI-ModelScope/orpo-dpo-mix-40k',
|
| 800 |
+
hf_dataset_id='mlabonne/orpo-dpo-mix-40k',
|
| 801 |
+
preprocess_func=OrpoDPOMix40kPreprocessor(columns={
|
| 802 |
+
'chosen': 'messages',
|
| 803 |
+
'rejected': 'rejected_messages'
|
| 804 |
+
}),
|
| 805 |
+
tags=['dpo', 'orpo', 'en', 'quality']))
|
| 806 |
+
|
| 807 |
+
register_dataset(
|
| 808 |
+
DatasetMeta(
|
| 809 |
+
ms_dataset_id='swift/sharegpt',
|
| 810 |
+
subsets=['common-zh', 'unknow-zh', 'common-en'],
|
| 811 |
+
tags=['chat', 'general', 'multi-round']))
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
class SelfCognitionPreprocessor(ResponsePreprocessor):
    """Substitute {{NAME}}/{{AUTHOR}} placeholders in query and response,
    choosing the (zh, en) value by the row's language tag."""

    # (zh, en) value pairs; None disables substitution for that key.
    name: Optional[Tuple[str, str]] = None
    author: Optional[Tuple[str, str]] = None

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        for key in ('name', 'author'):
            pair = getattr(self, key)
            if pair is None:
                continue
            # Pick the language-specific value for this row.
            value = pair[0] if row['tag'] == 'zh' else pair[1]
            if value is None:
                continue
            placeholder = '{{' + key.upper() + '}}'
            row['query'] = row['query'].replace(placeholder, value)
            row['response'] = row['response'].replace(placeholder, value)
        return super().preprocess(row)
|
| 830 |
+
|
| 831 |
+
|
| 832 |
+
class Qwen3SelfCognitionPreprocessor(SelfCognitionPreprocessor):
    """Qwen3 variant: suppress thinking via /no_think plus an empty think block."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        row['query'] = f"{row['query']} /no_think"
        row['response'] = f"<think>\n\n</think>\n\n{row['response']}"
        return super().preprocess(row)
|
| 838 |
+
|
| 839 |
+
|
| 840 |
+
class EmptyThinkSelfCognitionPreprocessor(SelfCognitionPreprocessor):
    """Prefix responses with an empty think block (query left untouched)."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        row['response'] = f"<think>\n\n</think>\n\n{row['response']}"
        return super().preprocess(row)
|
| 845 |
+
|
| 846 |
+
|
| 847 |
+
register_dataset(
|
| 848 |
+
DatasetMeta(
|
| 849 |
+
ms_dataset_id='swift/self-cognition',
|
| 850 |
+
hf_dataset_id='modelscope/self-cognition',
|
| 851 |
+
subsets=[
|
| 852 |
+
SubsetDataset(preprocess_func=SelfCognitionPreprocessor()),
|
| 853 |
+
SubsetDataset('qwen3', preprocess_func=Qwen3SelfCognitionPreprocessor()),
|
| 854 |
+
SubsetDataset('empty_think', preprocess_func=EmptyThinkSelfCognitionPreprocessor()),
|
| 855 |
+
],
|
| 856 |
+
tags=['chat', 'self-cognition', '🔥']))
|
ms-swift/swift/llm/dataset/preprocessor/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (513 Bytes). View file
|
|
|
ms-swift/swift/llm/dataset/preprocessor/__pycache__/core.cpython-310.pyc
ADDED
|
Binary file (18 kB). View file
|
|
|
ms-swift/swift/llm/dataset/register.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import os
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
from swift.utils import get_logger, use_hf_hub
|
| 10 |
+
from .preprocessor import DATASET_TYPE, AutoPreprocessor, MessagesPreprocessor
|
| 11 |
+
|
| 12 |
+
PreprocessFunc = Callable[..., DATASET_TYPE]
|
| 13 |
+
LoadFunction = Callable[..., DATASET_TYPE]
|
| 14 |
+
logger = get_logger()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class SubsetDataset:
    """Description of one subset of a registered dataset."""

    # `name` is used for matching subsets of the dataset; `subset` is the
    # subset name on the hub. When `name` is None it falls back to `subset`.
    name: Optional[str] = None
    subset: str = 'default'

    # Higher priority than DatasetMeta; None means "inherit from DatasetMeta".
    split: Optional[List[str]] = None
    preprocess_func: Optional[PreprocessFunc] = None

    # When the dataset specifies "all", weak subsets are skipped.
    is_weak_subset: bool = False

    def __post_init__(self):
        if self.name is None:
            self.name = self.subset

    def set_default(self, dataset_meta: 'DatasetMeta') -> 'SubsetDataset':
        """Return a copy whose unset fields are inherited from `dataset_meta`."""
        clone = deepcopy(self)
        for attr in ('split', 'preprocess_func'):
            if getattr(clone, attr) is None:
                setattr(clone, attr, deepcopy(getattr(dataset_meta, attr)))
        return clone
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class DatasetMeta:
    """Registration record for a dataset: hub ids, subsets, splits and the
    preprocessing/loading hooks."""

    ms_dataset_id: Optional[str] = None
    hf_dataset_id: Optional[str] = None
    dataset_path: Optional[str] = None
    dataset_name: Optional[str] = None
    ms_revision: Optional[str] = None
    hf_revision: Optional[str] = None

    subsets: List[Union[SubsetDataset, str]] = field(default_factory=lambda: ['default'])
    # Applies to every subset unless the subset overrides it.
    split: List[str] = field(default_factory=lambda: ['train'])
    # Column mapping runs first, then the preprocess_func.
    preprocess_func: PreprocessFunc = field(default_factory=lambda: AutoPreprocessor())
    load_function: Optional[LoadFunction] = None

    tags: List[str] = field(default_factory=list)
    help: Optional[str] = None
    huge_dataset: bool = False

    def __post_init__(self):
        # Imported lazily to avoid a circular import with .loader.
        from .loader import DatasetLoader
        if self.load_function is None:
            self.load_function = DatasetLoader.load
        # Normalize bare subset names into SubsetDataset records, in place.
        for i, subset in enumerate(self.subsets):
            if isinstance(subset, str):
                self.subsets[i] = SubsetDataset(subset=subset)
|
| 72 |
+
|
| 73 |
+
DATASET_MAPPING: Dict[Tuple[str, str, str], DatasetMeta] = {}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def get_dataset_list():
    """Return the registered dataset ids for the active hub.

    Uses the HF id when `use_hf_hub()` is true, otherwise the ModelScope id;
    registry keys whose id is empty for the active hub are skipped.

    Returns:
        A list of dataset id strings.
    """
    # Fix: the hub choice is loop-invariant — the original re-evaluated
    # use_hf_hub() on every iteration.  Keys are
    # (ms_dataset_id, hf_dataset_id, dataset_path) tuples.
    idx = 1 if use_hf_hub() else 0
    return [key[idx] for key in DATASET_MAPPING if key[idx]]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def register_dataset(dataset_meta: DatasetMeta, *, exist_ok: bool = False) -> None:
    """Register dataset

    Args:
        dataset_meta: The `DatasetMeta` info of the dataset.
        exist_ok: If the dataset id exists, raise error or update it.
    """
    # The mapping key is the explicit dataset_name when given, otherwise the
    # (ms_dataset_id, hf_dataset_id, dataset_path) triple.
    dataset_name = dataset_meta.dataset_name or (dataset_meta.ms_dataset_id, dataset_meta.hf_dataset_id,
                                                 dataset_meta.dataset_path)
    if dataset_name in DATASET_MAPPING and not exist_ok:
        raise ValueError(f'The `{dataset_name}` has already been registered in the DATASET_MAPPING.')

    DATASET_MAPPING[dataset_name] = dataset_meta
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _preprocess_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str] = None) -> Dict[str, Any]:
    """Normalize a raw dataset-info dict into DatasetMeta keyword arguments."""
    d_info = deepcopy(d_info)

    columns = d_info.pop('columns', None)

    # A 'messages' section configures a MessagesPreprocessor; otherwise the
    # format is auto-detected.
    if 'messages' in d_info:
        d_info['preprocess_func'] = MessagesPreprocessor(**d_info.pop('messages'), columns=columns)
    else:
        d_info['preprocess_func'] = AutoPreprocessor(columns=columns)

    if 'dataset_path' in d_info:
        path = d_info.pop('dataset_path')
        # Relative paths resolve against the info file's directory.
        if base_dir is not None and not os.path.isabs(path):
            path = os.path.join(base_dir, path)
        d_info['dataset_path'] = os.path.abspath(os.path.expanduser(path))

    if 'subsets' in d_info:
        subsets = d_info.pop('subsets')
        for i, subset in enumerate(subsets):
            if isinstance(subset, dict):
                # Subset dicts may carry their own columns/messages config.
                subsets[i] = SubsetDataset(**_preprocess_d_info(subset))
        d_info['subsets'] = subsets
    return d_info
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _register_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str] = None) -> DatasetMeta:
    """Register a single dataset to dataset mapping

    Args:
        d_info: The dataset info
    """
    meta = DatasetMeta(**_preprocess_d_info(d_info, base_dir=base_dir))
    register_dataset(meta)
    return meta
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def register_dataset_info(dataset_info: Union[str, List[str], None] = None) -> List[DatasetMeta]:
    """Register dataset from the `dataset_info.json` or a custom dataset info file
    This is used to deal with the datasets defined in the json info file.

    Args:
        dataset_info: The dataset info path
    """
    # dataset_info may be: None (bundled file), a file path, or inline JSON.
    if dataset_info is None:
        dataset_info = os.path.join(os.path.dirname(__file__), 'data', 'dataset_info.json')
    assert isinstance(dataset_info, (str, list))
    base_dir = None
    log_msg = None
    if isinstance(dataset_info, str):
        dataset_path = os.path.abspath(os.path.expanduser(dataset_info))
        if os.path.isfile(dataset_path):
            log_msg = dataset_path
            # Relative dataset_path entries resolve against this directory.
            base_dir = os.path.dirname(dataset_path)
            with open(dataset_path, 'r', encoding='utf-8') as f:
                dataset_info = json.load(f)
        else:
            dataset_info = json.loads(dataset_info)  # inline json
    if len(dataset_info) == 0:
        return []
    res = [_register_d_info(d_info, base_dir=base_dir) for d_info in dataset_info]

    if log_msg is None:
        # NOTE(review): `.keys()` assumes a dict here; a long inline JSON
        # *list* would raise AttributeError — confirm the expected shape.
        log_msg = dataset_info if len(dataset_info) < 5 else list(dataset_info.keys())
    logger.info(f'Successfully registered `{log_msg}`.')
    return res
|
ms-swift/swift/llm/ds_config/zero0.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fp16": {
|
| 3 |
+
"enabled": "auto",
|
| 4 |
+
"loss_scale": 0,
|
| 5 |
+
"loss_scale_window": 1000,
|
| 6 |
+
"initial_scale_power": 16,
|
| 7 |
+
"hysteresis": 2,
|
| 8 |
+
"min_loss_scale": 1
|
| 9 |
+
},
|
| 10 |
+
|
| 11 |
+
"bf16": {
|
| 12 |
+
"enabled": "auto"
|
| 13 |
+
},
|
| 14 |
+
|
| 15 |
+
"zero_optimization": {
|
| 16 |
+
"stage": 0,
|
| 17 |
+
"allgather_partitions": true,
|
| 18 |
+
"allgather_bucket_size": 2e8,
|
| 19 |
+
"overlap_comm": false,
|
| 20 |
+
"reduce_scatter": true,
|
| 21 |
+
"reduce_bucket_size": 2e8,
|
| 22 |
+
"contiguous_gradients": true
|
| 23 |
+
},
|
| 24 |
+
|
| 25 |
+
"gradient_accumulation_steps": "auto",
|
| 26 |
+
"gradient_clipping": "auto",
|
| 27 |
+
"steps_per_print": 2000,
|
| 28 |
+
"train_batch_size": "auto",
|
| 29 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 30 |
+
"wall_clock_breakdown": false
|
| 31 |
+
}
|