Student0809 commited on
Commit
fb79a6d
·
verified ·
1 Parent(s): 356aced

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. ms-swift/examples/train/multi-gpu/device_map/train.sh +25 -0
  2. ms-swift/examples/train/multimodal/grounding.sh +27 -0
  3. ms-swift/examples/train/multimodal/lora_llm_full_vit/sft.sh +30 -0
  4. ms-swift/examples/train/multimodal/rlhf/dpo.sh +33 -0
  5. ms-swift/examples/train/rlhf/ppo.sh +33 -0
  6. ms-swift/examples/train/seq_cls/qwen2_5/sft.sh +28 -0
  7. ms-swift/examples/train/seq_cls/qwen2_vl/infer.sh +5 -0
  8. ms-swift/examples/train/tuners/adapter/train.sh +16 -0
  9. ms-swift/examples/train/tuners/boft/train.sh +16 -0
  10. ms-swift/examples/train/tuners/dora/train.sh +19 -0
  11. ms-swift/examples/train/tuners/galore/train_galore.sh +18 -0
  12. ms-swift/examples/train/tuners/llamapro/train.sh +17 -0
  13. ms-swift/examples/train/tuners/olora/train.sh +19 -0
  14. ms-swift/examples/train/tuners/pissa/train.sh +19 -0
  15. ms-swift/examples/train/tuners/qlora/train.sh +19 -0
  16. ms-swift/examples/train/tuners/reft/train.sh +17 -0
  17. ms-swift/ms_swift.egg-info/PKG-INFO +545 -0
  18. ms-swift/ms_swift.egg-info/not-zip-safe +1 -0
  19. ms-swift/requirements/install_all.sh +12 -0
  20. ms-swift/requirements/seq_parallel.txt +1 -0
  21. ms-swift/requirements/swanlab.txt +1 -0
  22. ms-swift/scripts/benchmark/config/tuner.json +301 -0
  23. ms-swift/scripts/benchmark/exp.py +50 -0
  24. ms-swift/scripts/benchmark/generate_report.py +433 -0
  25. ms-swift/scripts/utils/run_dataset_info.py +106 -0
  26. ms-swift/scripts/utils/run_template.py +8 -0
  27. ms-swift/swift/__init__.py +55 -0
  28. ms-swift/swift/cli/__init__.py +0 -0
  29. ms-swift/swift/cli/__pycache__/__init__.cpython-310.pyc +0 -0
  30. ms-swift/swift/cli/_megatron/pt.py +4 -0
  31. ms-swift/swift/cli/_megatron/sft.py +4 -0
  32. ms-swift/swift/cli/app.py +4 -0
  33. ms-swift/swift/cli/eval.py +5 -0
  34. ms-swift/swift/cli/export.py +5 -0
  35. ms-swift/swift/cli/main.py +76 -0
  36. ms-swift/swift/cli/pt.py +5 -0
  37. ms-swift/swift/cli/rollout.py +5 -0
  38. ms-swift/swift/hub/__pycache__/__init__.cpython-310.pyc +0 -0
  39. ms-swift/swift/hub/__pycache__/hub.cpython-310.pyc +0 -0
  40. ms-swift/swift/llm/__pycache__/__init__.cpython-310.pyc +0 -0
  41. ms-swift/swift/llm/__pycache__/data_loader.cpython-310.pyc +0 -0
  42. ms-swift/swift/llm/app/__init__.py +1 -0
  43. ms-swift/swift/llm/argument/app_args.py +38 -0
  44. ms-swift/swift/llm/data_loader.py +105 -0
  45. ms-swift/swift/llm/dataset/dataset/__pycache__/mllm.cpython-310.pyc +0 -0
  46. ms-swift/swift/llm/dataset/dataset/llm.py +856 -0
  47. ms-swift/swift/llm/dataset/preprocessor/__pycache__/__init__.cpython-310.pyc +0 -0
  48. ms-swift/swift/llm/dataset/preprocessor/__pycache__/core.cpython-310.pyc +0 -0
  49. ms-swift/swift/llm/dataset/register.py +177 -0
  50. ms-swift/swift/llm/ds_config/zero0.json +31 -0
ms-swift/examples/train/multi-gpu/device_map/train.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 2 * 76GiB
2
+ CUDA_VISIBLE_DEVICES=0,1 \
3
+ MAX_PIXELS=1003520 \
4
+ swift sft \
5
+ --model Qwen/Qwen2.5-VL-72B-Instruct \
6
+ --dataset 'modelscope/coco_2014_caption:validation#20000' \
7
+ --train_type lora \
8
+ --torch_dtype bfloat16 \
9
+ --num_train_epochs 1 \
10
+ --per_device_train_batch_size 1 \
11
+ --per_device_eval_batch_size 1 \
12
+ --learning_rate 1e-4 \
13
+ --lora_rank 8 \
14
+ --lora_alpha 32 \
15
+ --target_modules all-linear \
16
+ --freeze_vit true \
17
+ --gradient_accumulation_steps 16 \
18
+ --eval_steps 100 \
19
+ --save_steps 100 \
20
+ --save_total_limit 2 \
21
+ --logging_steps 5 \
22
+ --max_length 2048 \
23
+ --output_dir output \
24
+ --warmup_ratio 0.05 \
25
+ --dataloader_num_workers 4
ms-swift/examples/train/multimodal/grounding.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 20GiB
2
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
3
+ CUDA_VISIBLE_DEVICES=0 \
4
+ MAX_PIXELS=1003520 \
5
+ swift sft \
6
+ --model Qwen/Qwen2-VL-7B-Instruct \
7
+ --dataset 'AI-ModelScope/coco#20000' \
8
+ --train_type lora \
9
+ --torch_dtype bfloat16 \
10
+ --num_train_epochs 1 \
11
+ --per_device_train_batch_size 1 \
12
+ --per_device_eval_batch_size 1 \
13
+ --learning_rate 1e-4 \
14
+ --lora_rank 8 \
15
+ --lora_alpha 32 \
16
+ --target_modules all-linear \
17
+ --freeze_vit true \
18
+ --gradient_accumulation_steps 16 \
19
+ --eval_steps 100 \
20
+ --save_steps 100 \
21
+ --save_total_limit 2 \
22
+ --logging_steps 5 \
23
+ --max_length 2048 \
24
+ --output_dir output \
25
+ --warmup_ratio 0.05 \
26
+ --dataloader_num_workers 4 \
27
+ --dataset_num_proc 4
ms-swift/examples/train/multimodal/lora_llm_full_vit/sft.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 4 * 22GiB
2
+ # vit/merger lr 1e-5; llm lora lr 1e-4
3
+ NPROC_PER_NODE=4 \
4
+ CUDA_VISIBLE_DEVICES=0,1,2,3 \
5
+ MAX_PIXELS=1003520 \
6
+ swift sft \
7
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
8
+ --dataset 'AI-ModelScope/coco#20000' \
9
+ --train_type custom \
10
+ --optimizer custom \
11
+ --external_plugins 'examples/train/multimodal/lora_llm_full_vit/custom_plugin.py' \
12
+ --torch_dtype bfloat16 \
13
+ --num_train_epochs 1 \
14
+ --per_device_train_batch_size 1 \
15
+ --per_device_eval_batch_size 1 \
16
+ --learning_rate 1e-4 \
17
+ --lora_rank 16 \
18
+ --lora_alpha 32 \
19
+ --gradient_accumulation_steps 4 \
20
+ --eval_steps 100 \
21
+ --save_steps 100 \
22
+ --save_total_limit 2 \
23
+ --logging_steps 5 \
24
+ --max_length 8192 \
25
+ --output_dir output \
26
+ --warmup_ratio 0.05 \
27
+ --dataloader_num_workers 4 \
28
+ --dataset_num_proc 4 \
29
+ --deepspeed zero2 \
30
+ --save_only_model true
ms-swift/examples/train/multimodal/rlhf/dpo.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 4*50GiB
2
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
3
+ # --rlhf_type cpo/orpo/simpo/rm are also supported
4
+ nproc_per_node=2
5
+
6
+ CUDA_VISIBLE_DEVICES=0,1 \
7
+ NPROC_PER_NODE=$nproc_per_node \
8
+ MAX_PIXELS=1003520 \
9
+ swift rlhf \
10
+ --rlhf_type dpo \
11
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
12
+ --dataset 'swift/RLAIF-V-Dataset#20000' \
13
+ --train_type lora \
14
+ --torch_dtype bfloat16 \
15
+ --num_train_epochs 1 \
16
+ --per_device_train_batch_size 1 \
17
+ --per_device_eval_batch_size 1 \
18
+ --learning_rate 1e-4 \
19
+ --lora_rank 8 \
20
+ --lora_alpha 32 \
21
+ --target_modules all-linear \
22
+ --freeze_vit true \
23
+ --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
24
+ --eval_steps 100 \
25
+ --save_steps 100 \
26
+ --save_total_limit 2 \
27
+ --deepspeed zero2 \
28
+ --logging_steps 5 \
29
+ --max_length 4096 \
30
+ --output_dir output \
31
+ --warmup_ratio 0.05 \
32
+ --dataloader_num_workers 4 \
33
+ --dataset_num_proc 4
ms-swift/examples/train/rlhf/ppo.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Currently, it only supports the case where the model and reward_model use the same template/tokenizer.
2
+ # Currently, multimodal model PPO is not supported.
3
+ nproc_per_node=4
4
+
5
+ CUDA_VISIBLE_DEVICES=0,1,2,3 \
6
+ NPROC_PER_NODE=$nproc_per_node \
7
+ swift rlhf \
8
+ --rlhf_type ppo \
9
+ --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
10
+ --reward_model 'AI-ModelScope/Skywork-Reward-Llama-3.1-8B-v0.2' \
11
+ --train_type lora \
12
+ --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' 'AI-ModelScope/alpaca-gpt4-data-en#20000' \
13
+ --torch_dtype bfloat16 \
14
+ --num_train_epochs 1 \
15
+ --per_device_train_batch_size 1 \
16
+ --per_device_eval_batch_size 1 \
17
+ --learning_rate 1e-5 \
18
+ --lora_rank 8 \
19
+ --lora_alpha 32 \
20
+ --target_modules all-linear \
21
+ --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
22
+ --eval_steps 100 \
23
+ --save_steps 100 \
24
+ --save_total_limit 2 \
25
+ --logging_steps 5 \
26
+ --max_length 2048 \
27
+ --output_dir output \
28
+ --warmup_ratio 0.05 \
29
+ --dataloader_num_workers 4 \
30
+ --deepspeed zero2 \
31
+ --response_length 512 \
32
+ --temperature 0.7 \
33
+ --dataset_num_proc 4
ms-swift/examples/train/seq_cls/qwen2_5/sft.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # If `num_labels` is provided, it will be considered a classification task,
2
+ # and AutoModelForSequenceClassification will be used to load the model.
3
+ # You can also specify `--model Qwen/Qwen2.5-0.5B-Instruct --use_chat_template true`.
4
+ CUDA_VISIBLE_DEVICES=0 \
5
+ swift sft \
6
+ --model Qwen/Qwen2.5-0.5B \
7
+ --train_type lora \
8
+ --dataset 'DAMO_NLP/jd:cls#2000' \
9
+ --torch_dtype bfloat16 \
10
+ --num_train_epochs 1 \
11
+ --per_device_train_batch_size 1 \
12
+ --per_device_eval_batch_size 1 \
13
+ --learning_rate 1e-4 \
14
+ --lora_rank 8 \
15
+ --lora_alpha 32 \
16
+ --target_modules all-linear \
17
+ --gradient_accumulation_steps 16 \
18
+ --eval_steps 50 \
19
+ --save_steps 50 \
20
+ --save_total_limit 2 \
21
+ --logging_steps 5 \
22
+ --max_length 2048 \
23
+ --output_dir output \
24
+ --warmup_ratio 0.05 \
25
+ --dataloader_num_workers 4 \
26
+ --num_labels 2 \
27
+ --task_type seq_cls \
28
+ --use_chat_template false
ms-swift/examples/train/seq_cls/qwen2_vl/infer.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ CUDA_VISIBLE_DEVICES=0 \
2
+ MAX_PIXELS=1003520 \
3
+ swift infer \
4
+ --adapters output/vx-xxx/checkpoint-xxx \
5
+ --load_data_args true
ms-swift/examples/train/tuners/adapter/train.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 17GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type adapter \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-4 \
10
+ --gradient_accumulation_steps 16 \
11
+ --eval_steps 100 \
12
+ --save_steps 100 \
13
+ --save_total_limit 2 \
14
+ --logging_steps 5 \
15
+ --model_author swift \
16
+ --model_name swift-robot
ms-swift/examples/train/tuners/boft/train.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 17GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type boft \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-4 \
10
+ --gradient_accumulation_steps 16 \
11
+ --eval_steps 100 \
12
+ --save_steps 100 \
13
+ --save_total_limit 2 \
14
+ --logging_steps 5 \
15
+ --model_author swift \
16
+ --model_name swift-robot
ms-swift/examples/train/tuners/dora/train.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 17.2GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type lora \
6
+ --use_dora true \
7
+ --dataset 'swift/self-cognition#1000' \
8
+ --num_train_epochs 1 \
9
+ --per_device_train_batch_size 1 \
10
+ --learning_rate 1e-4 \
11
+ --lora_rank 8 \
12
+ --lora_alpha 32 \
13
+ --gradient_accumulation_steps 16 \
14
+ --eval_steps 100 \
15
+ --save_steps 100 \
16
+ --save_total_limit 2 \
17
+ --logging_steps 5 \
18
+ --model_author swift \
19
+ --model_name swift-robot
ms-swift/examples/train/tuners/galore/train_galore.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 38GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type full \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-5 \
10
+ --gradient_accumulation_steps 16 \
11
+ --eval_steps 100 \
12
+ --save_steps 100 \
13
+ --save_total_limit 2 \
14
+ --logging_steps 5 \
15
+ --model_author swift \
16
+ --model_name swift-robot \
17
+ --use_galore true \
18
+ --galore_optim_per_parameter true
ms-swift/examples/train/tuners/llamapro/train.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 25.4GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type llamapro \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --llamapro_num_new_blocks 4 \
8
+ --num_train_epochs 1 \
9
+ --per_device_train_batch_size 1 \
10
+ --learning_rate 1e-4 \
11
+ --gradient_accumulation_steps 16 \
12
+ --eval_steps 100 \
13
+ --save_steps 100 \
14
+ --save_total_limit 2 \
15
+ --logging_steps 5 \
16
+ --model_author swift \
17
+ --model_name swift-robot
ms-swift/examples/train/tuners/olora/train.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 17GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type lora \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-4 \
10
+ --lora_rank 8 \
11
+ --lora_alpha 32 \
12
+ --init_lora_weights olora \
13
+ --gradient_accumulation_steps 16 \
14
+ --eval_steps 100 \
15
+ --save_steps 100 \
16
+ --save_total_limit 2 \
17
+ --logging_steps 5 \
18
+ --model_author swift \
19
+ --model_name swift-robot
ms-swift/examples/train/tuners/pissa/train.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 17GiB
2
+ CUDA_VISIBLE_DEVICES=0 \
3
+ swift sft \
4
+ --model Qwen/Qwen2.5-7B-Instruct \
5
+ --train_type lora \
6
+ --dataset 'swift/self-cognition#1000' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-4 \
10
+ --lora_rank 8 \
11
+ --lora_alpha 32 \
12
+ --init_lora_weights pissa \
13
+ --gradient_accumulation_steps 16 \
14
+ --eval_steps 100 \
15
+ --save_steps 100 \
16
+ --save_total_limit 2 \
17
+ --logging_steps 5 \
18
+ --model_author swift \
19
+ --model_name swift-robot
ms-swift/examples/train/tuners/qlora/train.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CUDA_VISIBLE_DEVICES=0 \
2
+ swift sft \
3
+ --model Qwen/Qwen2.5-7B-Instruct \
4
+ --train_type lora \
5
+ --dataset 'swift/self-cognition#1000' \
6
+ --num_train_epochs 1 \
7
+ --per_device_train_batch_size 1 \
8
+ --learning_rate 1e-4 \
9
+ --lora_rank 8 \
10
+ --lora_alpha 32 \
11
+ --gradient_accumulation_steps 16 \
12
+ --eval_steps 100 \
13
+ --save_steps 100 \
14
+ --save_total_limit 2 \
15
+ --logging_steps 5 \
16
+ --model_author swift \
17
+ --model_name swift-robot \
18
+ --quant_bits 4 \
19
+ --quant_method bnb
ms-swift/examples/train/tuners/reft/train.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CUDA_VISIBLE_DEVICES=0 \
2
+ swift sft \
3
+ --model Qwen/Qwen2.5-7B-Instruct \
4
+ --train_type reft \
5
+ --dataset 'swift/self-cognition#1000' \
6
+ --reft_intervention_type 'LoreftIntervention' \
7
+ --num_train_epochs 1 \
8
+ --per_device_train_batch_size 1 \
9
+ --learning_rate 1e-4 \
10
+ --gradient_checkpointing false \
11
+ --gradient_accumulation_steps 16 \
12
+ --eval_steps 100 \
13
+ --save_steps 100 \
14
+ --save_total_limit 2 \
15
+ --logging_steps 5 \
16
+ --model_author swift \
17
+ --model_name swift-robot
ms-swift/ms_swift.egg-info/PKG-INFO ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: ms_swift
3
+ Version: 3.5.0.dev0
4
+ Summary: Swift: Scalable lightWeight Infrastructure for Fine-Tuning
5
+ Home-page: https://github.com/modelscope/swift
6
+ Author: DAMO ModelScope teams
7
+ Author-email: contact@modelscope.cn
8
+ License: Apache License 2.0
9
+ Keywords: python,petl,efficient tuners
10
+ Platform: UNKNOWN
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: accelerate
23
+ Requires-Dist: addict
24
+ Requires-Dist: aiohttp
25
+ Requires-Dist: attrdict
26
+ Requires-Dist: binpacking
27
+ Requires-Dist: charset_normalizer
28
+ Requires-Dist: cpm_kernels
29
+ Requires-Dist: dacite
30
+ Requires-Dist: datasets<3.4,>=3.0
31
+ Requires-Dist: einops
32
+ Requires-Dist: fastapi
33
+ Requires-Dist: gradio>=3.40.0
34
+ Requires-Dist: importlib_metadata
35
+ Requires-Dist: jieba
36
+ Requires-Dist: matplotlib
37
+ Requires-Dist: modelscope>=1.23
38
+ Requires-Dist: nltk
39
+ Requires-Dist: numpy<2.0
40
+ Requires-Dist: openai
41
+ Requires-Dist: oss2
42
+ Requires-Dist: pandas
43
+ Requires-Dist: peft<0.16,>=0.11
44
+ Requires-Dist: pillow
45
+ Requires-Dist: requests
46
+ Requires-Dist: rouge
47
+ Requires-Dist: safetensors
48
+ Requires-Dist: scipy
49
+ Requires-Dist: sentencepiece
50
+ Requires-Dist: simplejson>=3.3.0
51
+ Requires-Dist: sortedcontainers>=1.5.9
52
+ Requires-Dist: tensorboard
53
+ Requires-Dist: tiktoken
54
+ Requires-Dist: tqdm
55
+ Requires-Dist: transformers<4.53,>=4.33
56
+ Requires-Dist: transformers_stream_generator
57
+ Requires-Dist: trl<0.18,>=0.13
58
+ Requires-Dist: uvicorn
59
+ Requires-Dist: zstandard
60
+ Provides-Extra: eval
61
+ Requires-Dist: evalscope[opencompass]; extra == "eval"
62
+ Requires-Dist: evalscope[vlmeval]; extra == "eval"
63
+ Provides-Extra: swanlab
64
+ Requires-Dist: swanlab; extra == "swanlab"
65
+ Provides-Extra: seq-parallel
66
+ Requires-Dist: xtuner; extra == "seq-parallel"
67
+ Provides-Extra: all
68
+ Requires-Dist: accelerate; extra == "all"
69
+ Requires-Dist: addict; extra == "all"
70
+ Requires-Dist: aiohttp; extra == "all"
71
+ Requires-Dist: attrdict; extra == "all"
72
+ Requires-Dist: binpacking; extra == "all"
73
+ Requires-Dist: charset_normalizer; extra == "all"
74
+ Requires-Dist: cpm_kernels; extra == "all"
75
+ Requires-Dist: dacite; extra == "all"
76
+ Requires-Dist: datasets<3.4,>=3.0; extra == "all"
77
+ Requires-Dist: einops; extra == "all"
78
+ Requires-Dist: fastapi; extra == "all"
79
+ Requires-Dist: gradio>=3.40.0; extra == "all"
80
+ Requires-Dist: importlib_metadata; extra == "all"
81
+ Requires-Dist: jieba; extra == "all"
82
+ Requires-Dist: matplotlib; extra == "all"
83
+ Requires-Dist: modelscope>=1.23; extra == "all"
84
+ Requires-Dist: nltk; extra == "all"
85
+ Requires-Dist: numpy<2.0; extra == "all"
86
+ Requires-Dist: openai; extra == "all"
87
+ Requires-Dist: oss2; extra == "all"
88
+ Requires-Dist: pandas; extra == "all"
89
+ Requires-Dist: peft<0.16,>=0.11; extra == "all"
90
+ Requires-Dist: pillow; extra == "all"
91
+ Requires-Dist: requests; extra == "all"
92
+ Requires-Dist: rouge; extra == "all"
93
+ Requires-Dist: safetensors; extra == "all"
94
+ Requires-Dist: scipy; extra == "all"
95
+ Requires-Dist: sentencepiece; extra == "all"
96
+ Requires-Dist: simplejson>=3.3.0; extra == "all"
97
+ Requires-Dist: sortedcontainers>=1.5.9; extra == "all"
98
+ Requires-Dist: tensorboard; extra == "all"
99
+ Requires-Dist: tiktoken; extra == "all"
100
+ Requires-Dist: tqdm; extra == "all"
101
+ Requires-Dist: transformers<4.53,>=4.33; extra == "all"
102
+ Requires-Dist: transformers_stream_generator; extra == "all"
103
+ Requires-Dist: trl<0.18,>=0.13; extra == "all"
104
+ Requires-Dist: uvicorn; extra == "all"
105
+ Requires-Dist: zstandard; extra == "all"
106
+ Requires-Dist: evalscope[opencompass]; extra == "all"
107
+ Requires-Dist: evalscope[vlmeval]; extra == "all"
108
+ Requires-Dist: xtuner; extra == "all"
109
+ Requires-Dist: swanlab; extra == "all"
110
+ Dynamic: author
111
+ Dynamic: author-email
112
+ Dynamic: classifier
113
+ Dynamic: description
114
+ Dynamic: description-content-type
115
+ Dynamic: home-page
116
+ Dynamic: keywords
117
+ Dynamic: license
118
+ Dynamic: license-file
119
+ Dynamic: provides-extra
120
+ Dynamic: requires-dist
121
+ Dynamic: summary
122
+
123
+ # SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)
124
+
125
+ <p align="center">
126
+ <br>
127
+ <img src="asset/banner.png"/>
128
+ <br>
129
+ <p>
130
+ <p align="center">
131
+ <a href="https://modelscope.cn/home">ModelScope Community Website</a>
132
+ <br>
133
+ <a href="README_CN.md">中文</a> &nbsp | &nbsp English &nbsp
134
+ </p>
135
+
136
+ <p align="center">
137
+ <img src="https://img.shields.io/badge/python-3.10-5be.svg">
138
+ <img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
139
+ <a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.19-5D91D4.svg"></a>
140
+ <a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
141
+ <a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
142
+ <a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
143
+ <a href="https://github.com/modelscope/swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
144
+ </p>
145
+
146
+ <p align="center">
147
+ <a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
148
+ </p>
149
+
150
+ <p align="center">
151
+ <a href="https://arxiv.org/abs/2408.05517">Paper</a> &nbsp | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a> &nbsp | &nbsp <a href="https://swift.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp
152
+ </p>
153
+
154
+ ## 📖 Table of Contents
155
+ - [Groups](#-Groups)
156
+ - [Introduction](#-introduction)
157
+ - [News](#-news)
158
+ - [Installation](#%EF%B8%8F-installation)
159
+ - [Quick Start](#-quick-Start)
160
+ - [Usage](#-Usage)
161
+ - [License](#-License)
162
+ - [Citation](#-citation)
163
+
164
+
165
+ ## ☎ Groups
166
+
167
+ You can contact us and communicate with us by adding our group:
168
+
169
+
170
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
171
+ :-------------------------:|:-------------------------:
172
+ <img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
173
+
174
+
175
+ ## 📝 Introduction
176
+ 🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 500+ large models and 200+ multi-modal large models. These large language models (LLMs) include models such as Qwen3, Qwen3-MoE, Qwen2.5, InternLM3, GLM4, Mistral, DeepSeek-R1, Yi1.5, TeleChat2, Baichuan2, and Gemma2. The multi-modal LLMs include models such as Qwen2.5-VL, Qwen2-Audio, Llama3.4, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2.
177
+
178
+ 🍔 Additionally, ms-swift incorporates the latest training technologies, including lightweight techniques such as LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger, as well as human alignment training methods like DPO, GRPO, RM, PPO, KTO, CPO, SimPO, and ORPO. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and it supports model quantization with technologies like GPTQ, AWQ, and BNB. Furthermore, ms-swift offers a Gradio-based Web UI and a wealth of best practices.
179
+
180
+ **Why choose ms-swift?**
181
+
182
+ - 🍎 **Model Types**: Supports 500+ pure text large models, **200+ multi-modal large models**, as well as All-to-All multi-modal models, sequence classification models, and embedding models, **covering the entire process from training to deployment**.
183
+ - **Dataset Types**: Comes with 150+ pre-training, fine-tuning, human alignment, multi-modal datasets, and supports custom datasets.
184
+ - **Hardware Support**: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, MPS, etc.
185
+ - 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel.
186
+ - **Distributed Training**: Supports distributed data parallel (DDP), device_map simple model parallelism, DeepSpeed ZeRO2/ZeRO3, FSDP, and other distributed training techniques.
187
+ - **Quantization Training**: Supports training quantized models like BNB, AWQ, GPTQ, AQLM, HQQ, EETQ.
188
+ - **RLHF Training**: Supports human alignment training methods such as DPO, GRPO, RM, PPO, KTO, CPO, SimPO, ORPO for both pure text and multi-modal large models.
189
+ - 🍓 **Multi-Modal Training**: Supports training on different modalities like images, videos, and audio, for tasks like VQA, captioning, OCR, and grounding.
190
+ - **Interface Training**: Provides capabilities for training, inference, evaluation, quantization through an interface, completing the whole large model pipeline.
191
+ - **Plugin and Extension**: Supports custom model and dataset extensions, as well as customization of components like loss, metric, trainer, loss-scale, callback, optimizer.
192
+ - 🍉 **Toolbox Capabilities**: Offers not only training support for large models and multi-modal large models but also covers the entire process of inference, evaluation, quantization, and deployment.
193
+ - **Inference Acceleration**: Supports inference acceleration engines like PyTorch, vLLM, LmDeploy, and provides OpenAI API for accelerating inference, deployment, and evaluation modules.
194
+ - **Model Evaluation**: Uses EvalScope as the evaluation backend and supports evaluation on 100+ datasets for both pure text and multi-modal models.
195
+ - **Model Quantization**: Supports AWQ, GPTQ, and BNB quantized exports, with models that can use vLLM/LmDeploy for inference acceleration and continue training.
196
+
197
+
198
+ ## 🎉 News
199
+ - ��� 2025.05.11: GRPO now supports custom processing logic for reward models. See the GenRM example [here](./docs/source_en/Instruction/GRPO.md#customized-reward-models) .
200
+ - 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025. You can find the paper at [this link](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
201
+ - 🎁 2025.03.23: Multi-round GRPO is now supported for training multi-turn dialogue scenarios (e.g., agent tool calling). Please refer to the [training script](https://idealab.alibaba-inc.com/examples/train/grpo/internal/train_multi_round.sh).
202
+ - 🎁 2025.03.16: Support for Megatron's parallel training techniques is now available. Please see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/zh-cn/latest/Instruction/Megatron-SWIFT训练.html).
203
+ - 🎁 2025.03.15: Fine-tuning of embedding models for both pure text and multimodal models is supported. Please check the [training script](https://idealab.alibaba-inc.com/examples/train/embedding).
204
+ - 🎁 2025.03.05: The hybrid mode for GRPO is supported, with a script for training a 72B model on 4 GPUs (4*80G) available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/train_72b_4gpu.sh). Tensor parallelism with vllm is also supported, with the training script available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/multi_gpu_mp_colocate.sh).
205
+ - 🎁 2025.02.21: The GRPO algorithm now supports LMDeploy, with the training script available [here](https://idealab.alibaba-inc.com/examples/train/grpo/internal/full_lmdeploy.sh). Additionally, the performance of the GRPO algorithm has been tested, achieving a training speed increase of up to 300% using various tricks. Please check the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
206
+ - 🎁 2025.02.21: The `swift sample` command is now supported. The reinforcement fine-tuning script can be found [here](https://idealab.alibaba-inc.com/docs/source/Instruction/强化微调.md), and the large model API distillation sampling script is available [here](https://idealab.alibaba-inc.com/examples/sampler/distill/distill.sh).
207
+ - 🔥 2025.02.12: Support for the GRPO (Group Relative Policy Optimization) training algorithm has been added. Documentation is available [here](https://idealab.alibaba-inc.com/docs/source/Instruction/GRPO.md).
208
+ - 🎁 2024.12.04: Major update to **ms-swift 3.0**. Please refer to the [release notes and changes](https://swift.readthedocs.io/zh-cn/latest/Instruction/ReleaseNote3.0.html).
209
+ <details><summary>More</summary>
210
+
211
+ - 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
212
+ - 🔥 2024.08.05: Support for using [evalscope](https://github.com/modelscope/evalscope/) as a backend for evaluating large models and multimodal models.
213
+ - 🔥 2024.07.29: Support for using [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) to accelerate inference for large models and multimodal models. When performing infer/deploy/eval, you can specify `--infer_backend vllm/lmdeploy`.
214
+ - 🔥 2024.07.24: Support for human preference alignment training for multimodal large models, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
215
+ - 🔥 2024.02.01: Support for Agent training! The training algorithm is derived from [this paper](https://arxiv.org/pdf/2309.00986.pdf).
216
+ </details>
217
+
218
+ ## 🛠️ Installation
219
+ To install using pip:
220
+ ```shell
221
+ pip install ms-swift -U
222
+ ```
223
+
224
+ To install from source:
225
+ ```shell
226
+ # pip install git+https://github.com/modelscope/ms-swift.git
227
+
228
+ git clone https://github.com/modelscope/ms-swift.git
229
+ cd ms-swift
230
+ pip install -e .
231
+ ```
232
+
233
+ Running Environment:
234
+
235
+ | | Range | Recommended | Notes |
236
+ | ------------ |--------------| ----------- | ----------------------------------------- |
237
+ | python | >=3.9 | 3.10 | |
238
+ | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
239
+ | torch | >=2.0 | | |
240
+ | transformers | >=4.33 | 4.51 | |
241
+ | modelscope | >=1.23 | | |
242
+ | peft | >=0.11,<0.16 | ||
243
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
244
+ | deepspeed | >=0.14 | 0.14.5 | Training |
245
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
246
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
247
+ | evalscope | >=0.11 | | Evaluation |
248
+
249
+ For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
250
+
251
+
252
+ ## 🚀 Quick Start
253
+
254
+ 10 minutes of self-cognition fine-tuning of Qwen2.5-7B-Instruct on a single 3090 GPU:
255
+
256
+ ### Command Line Interface
257
+
258
+ ```shell
259
+ # 22GB
260
+ CUDA_VISIBLE_DEVICES=0 \
261
+ swift sft \
262
+ --model Qwen/Qwen2.5-7B-Instruct \
263
+ --train_type lora \
264
+ --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
265
+ 'AI-ModelScope/alpaca-gpt4-data-en#500' \
266
+ 'swift/self-cognition#500' \
267
+ --torch_dtype bfloat16 \
268
+ --num_train_epochs 1 \
269
+ --per_device_train_batch_size 1 \
270
+ --per_device_eval_batch_size 1 \
271
+ --learning_rate 1e-4 \
272
+ --lora_rank 8 \
273
+ --lora_alpha 32 \
274
+ --target_modules all-linear \
275
+ --gradient_accumulation_steps 16 \
276
+ --eval_steps 50 \
277
+ --save_steps 50 \
278
+ --save_total_limit 2 \
279
+ --logging_steps 5 \
280
+ --max_length 2048 \
281
+ --output_dir output \
282
+ --system 'You are a helpful assistant.' \
283
+ --warmup_ratio 0.05 \
284
+ --dataloader_num_workers 4 \
285
+ --model_author swift \
286
+ --model_name swift-robot
287
+ ```
288
+
289
+ Tips:
290
+
291
+ - If you want to train with a custom dataset, you can refer to [this guide](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) to organize your dataset format and specify `--dataset <dataset_path>`.
292
+ - The `--model_author` and `--model_name` parameters are only effective when the dataset includes `swift/self-cognition`.
293
+ - To train with a different model, simply modify `--model <model_id/model_path>`.
294
+ - By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
295
+
296
+ After training is complete, use the following command to infer with the trained weights:
297
+
298
+ - Here, `--adapters` should be replaced with the last checkpoint folder generated during training. Since the adapters folder contains the training parameter file `args.json`, there is no need to specify `--model`, `--system` separately; Swift will automatically read these parameters. To disable this behavior, you can set `--load_args false`.
299
+
300
+ ```shell
301
+ # Using an interactive command line for inference.
302
+ CUDA_VISIBLE_DEVICES=0 \
303
+ swift infer \
304
+ --adapters output/vx-xxx/checkpoint-xxx \
305
+ --stream true \
306
+ --temperature 0 \
307
+ --max_new_tokens 2048
308
+
309
+ # merge-lora and use vLLM for inference acceleration
310
+ CUDA_VISIBLE_DEVICES=0 \
311
+ swift infer \
312
+ --adapters output/vx-xxx/checkpoint-xxx \
313
+ --stream true \
314
+ --merge_lora true \
315
+ --infer_backend vllm \
316
+ --max_model_len 8192 \
317
+ --temperature 0 \
318
+ --max_new_tokens 2048
319
+ ```
320
+
321
+ Finally, use the following command to push the model to ModelScope:
322
+
323
+ ```shell
324
+ CUDA_VISIBLE_DEVICES=0 \
325
+ swift export \
326
+ --adapters output/vx-xxx/checkpoint-xxx \
327
+ --push_to_hub true \
328
+ --hub_model_id '<your-model-id>' \
329
+ --hub_token '<your-sdk-token>' \
330
+ --use_hf false
331
+ ```
332
+
333
+
334
+ ### Web-UI
335
+ The Web-UI is a **zero-threshold** training and deployment interface solution based on Gradio interface technology. For more details, you can check [here](https://swift.readthedocs.io/en/latest/GetStarted/Web-UI.html).
336
+
337
+ ```shell
338
+ SWIFT_UI_LANG=en swift web-ui
339
+ ```
340
+
341
+ ![image.png](./docs/resources/web-ui-en.jpg)
342
+
343
+ ### Using Python
344
+
345
+ ms-swift also supports training and inference using Python. Below is pseudocode for training and inference. For more details, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb).
346
+
347
+ Training:
348
+
349
+ ```python
350
+ # Retrieve the model and template, and add a trainable LoRA module
351
+ model, tokenizer = get_model_tokenizer(model_id_or_path, ...)
352
+ template = get_template(model.model_meta.template, tokenizer, ...)
353
+ model = Swift.prepare_model(model, lora_config)
354
+
355
+ # Download and load the dataset, and encode the text into tokens
356
+ train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
357
+ train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
358
+ val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)
359
+
360
+ # Train the model
361
+ trainer = Seq2SeqTrainer(
362
+ model=model,
363
+ args=training_args,
364
+ data_collator=template.data_collator,
365
+ train_dataset=train_dataset,
366
+ eval_dataset=val_dataset,
367
+ template=template,
368
+ )
369
+ trainer.train()
370
+ ```
371
+ Inference:
372
+
373
+ ```python
374
+ # Perform inference using the native PyTorch engine
375
+ engine = PtEngine(model_id_or_path, adapters=[lora_checkpoint])
376
+ infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
377
+ request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
378
+
379
+ resp_list = engine.infer([infer_request], request_config)
380
+ print(f'response: {resp_list[0].choices[0].message.content}')
381
+ ```
382
+
383
+ ## ✨ Usage
384
+ Here is a minimal example of training to deployment using ms-swift. For more details, you can check the [examples](https://github.com/modelscope/ms-swift/tree/main/examples).
385
+
386
+ - If you want to use other models or datasets (including multimodal models and datasets), you only need to modify `--model` to specify the corresponding model's ID or path, and modify `--dataset` to specify the corresponding dataset's ID or path.
387
+ - By default, ModelScope is used for downloading models and datasets. If you want to use HuggingFace, simply specify `--use_hf true`.
388
+
389
+ | Useful Links |
390
+ | ------ |
391
+ | [🔥Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html) |
392
+ | [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) |
393
+ | [Custom Models](https://swift.readthedocs.io/en/latest/Customization/Custom-model.html), [🔥Custom Datasets](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html) |
394
+ | [LLM Tutorial](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |
395
+
396
+ ### Training
397
+
398
+ Supported Training Methods:
399
+
400
+ | Method | Full-Parameter | LoRA | QLoRA | Deepspeed | Multi-Node | Multi-Modal |
401
+ |------------------------------------|--------------------------------------------------------------|---------------------------------------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|--------------------------------------------------------------|----------------------------------------------------------------------------------------------|
402
+ | Pre-training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh) | ✅ | ✅ | ✅ | ✅ | ✅ |
403
+ | Instruction Supervised Fine-tuning | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
404
+ | DPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo.sh) |
405
+ | GRPO Training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/grpo_zero2.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_node) | ✅ |
406
+ | Reward Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ |
407
+ | PPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | ❌ |
408
+ | KTO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
409
+ | CPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ |
410
+ | SimPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ |
411
+ | ORPO Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ |
412
+ | Classification Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_5/sft.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_vl/sft.sh) |
413
+ | Embedding Model Training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gte.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh) |
414
+
415
+
416
+
417
+ Pre-training:
418
+ ```shell
419
+ # 8*A100
420
+ NPROC_PER_NODE=8 \
421
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
422
+ swift pt \
423
+ --model Qwen/Qwen2.5-7B \
424
+ --dataset swift/chinese-c4 \
425
+ --streaming true \
426
+ --train_type full \
427
+ --deepspeed zero2 \
428
+ --output_dir output \
429
+ --max_steps 10000 \
430
+ ...
431
+ ```
432
+
433
+ Fine-tuning:
434
+ ```shell
435
+ CUDA_VISIBLE_DEVICES=0 swift sft \
436
+ --model Qwen/Qwen2.5-7B-Instruct \
437
+ --dataset AI-ModelScope/alpaca-gpt4-data-en \
438
+ --train_type lora \
439
+ --output_dir output \
440
+ ...
441
+ ```
442
+
443
+ RLHF:
444
+ ```shell
445
+ CUDA_VISIBLE_DEVICES=0 swift rlhf \
446
+ --rlhf_type dpo \
447
+ --model Qwen/Qwen2.5-7B-Instruct \
448
+ --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
449
+ --train_type lora \
450
+ --output_dir output \
451
+ ...
452
+ ```
453
+
454
+
455
+ ### Inference
456
+ ```shell
457
+ CUDA_VISIBLE_DEVICES=0 swift infer \
458
+ --model Qwen/Qwen2.5-7B-Instruct \
459
+ --stream true \
460
+ --infer_backend pt \
461
+ --max_new_tokens 2048
462
+
463
+ # LoRA
464
+ CUDA_VISIBLE_DEVICES=0 swift infer \
465
+ --model Qwen/Qwen2.5-7B-Instruct \
466
+ --adapters swift/test_lora \
467
+ --stream true \
468
+ --infer_backend pt \
469
+ --temperature 0 \
470
+ --max_new_tokens 2048
471
+ ```
472
+
473
+ ### Interface Inference
474
+ ```shell
475
+ CUDA_VISIBLE_DEVICES=0 swift app \
476
+ --model Qwen/Qwen2.5-7B-Instruct \
477
+ --stream true \
478
+ --infer_backend pt \
479
+ --max_new_tokens 2048
480
+ ```
481
+
482
+ ### Deployment
483
+ ```shell
484
+ CUDA_VISIBLE_DEVICES=0 swift deploy \
485
+ --model Qwen/Qwen2.5-7B-Instruct \
486
+ --infer_backend vllm
487
+ ```
488
+
489
+ ### Sampling
490
+ ```shell
491
+ CUDA_VISIBLE_DEVICES=0 swift sample \
492
+ --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
493
+ --sampler_engine pt \
494
+ --num_return_sequences 5 \
495
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
496
+ ```
497
+
498
+ ### Evaluation
499
+ ```shell
500
+ CUDA_VISIBLE_DEVICES=0 swift eval \
501
+ --model Qwen/Qwen2.5-7B-Instruct \
502
+ --infer_backend lmdeploy \
503
+ --eval_backend OpenCompass \
504
+ --eval_dataset ARC_c
505
+ ```
506
+
507
+ ### Quantization
508
+ ```shell
509
+ CUDA_VISIBLE_DEVICES=0 swift export \
510
+ --model Qwen/Qwen2.5-7B-Instruct \
511
+ --quant_bits 4 --quant_method awq \
512
+ --dataset AI-ModelScope/alpaca-gpt4-data-zh \
513
+ --output_dir Qwen2.5-7B-Instruct-AWQ
514
+ ```
515
+
516
+ ### Push Model
517
+ ```shell
518
+ swift export \
519
+ --model <model-path> \
520
+ --push_to_hub true \
521
+ --hub_model_id '<model-id>' \
522
+ --hub_token '<sdk-token>'
523
+ ```
524
+
525
+ ## 🏛 License
526
+
527
+ This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). For models and datasets, please refer to the original resource page and follow the corresponding License.
528
+
529
+ ## 📎 Citation
530
+
531
+ ```bibtex
532
+ @misc{zhao2024swiftascalablelightweightinfrastructure,
533
+ title={SWIFT: A Scalable lightWeight Infrastructure for Fine-Tuning},
534
+ author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
535
+ year={2024},
536
+ eprint={2408.05517},
537
+ archivePrefix={arXiv},
538
+ primaryClass={cs.CL},
539
+ url={https://arxiv.org/abs/2408.05517},
540
+ }
541
+ ```
542
+
543
+ ## Star History
544
+
545
+ [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/swift&type=Date)](https://star-history.com/#modelscope/ms-swift&Date)
ms-swift/ms_swift.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
ms-swift/requirements/install_all.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Install every optional dependency of ms-swift in one pass.
# Environment: python=3.10 with cuda12.* — run as: sh requirements/install_all.sh
pip install "vllm>=0.5.1" -U
pip install "lmdeploy>=0.5" -U --no-deps
pip install autoawq -U --no-deps
pip install auto_gptq optimum bitsandbytes -U
# Latest ms-swift straight from the main branch.
pip install git+https://github.com/modelscope/ms-swift.git
pip install timm -U
pip install deepspeed -U
# Multimodal helpers (vision/audio decoding and preprocessing).
pip install qwen_vl_utils qwen_omni_utils decord librosa pyav icecream soundfile -U
pip install liger_kernel nvitop pre-commit -U
# flash-attn is installed separately from a prebuilt wheel:
# https://github.com/Dao-AILab/flash-attention/releases
ms-swift/requirements/seq_parallel.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ xtuner
ms-swift/requirements/swanlab.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ swanlab
ms-swift/scripts/benchmark/config/tuner.json ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cmd": "sft",
3
+ "requirements":{
4
+ "gpu": "1",
5
+ "ddp": "1"
6
+ },
7
+ "eval_requirements": {
8
+ "gpu": "1"
9
+ },
10
+ "eval_dataset": ["ceval", "gsm8k", "arc"],
11
+ "args": {
12
+ "model": "Qwen/Qwen-7B-Chat",
13
+ "dataset": "iic/ms_agent",
14
+ "per_device_train_batch_size": 1,
15
+ "max_length": 2048,
16
+ "loss_scale": "react",
17
+ "gradient_accumulation_steps": 16,
18
+ "learning_rate": 5e-5,
19
+ "attn_impl": "flash_attn",
20
+ "eval_steps": 2000,
21
+ "save_steps": 2000,
22
+ "num_train_epochs": 2,
23
+ "gradient_checkpointing": true,
24
+ "weight_decay": 0.01,
25
+ "warmup_ratio": 0.03,
26
+ "save_total_limit": 2,
27
+ "logging_steps": 10
28
+ },
29
+ "experiment": [
30
+ {
31
+ "name": "lora",
32
+ "args": {
33
+ "train_type": "lora",
34
+ "lora_rank": 8,
35
+ "lora_alpha": 32
36
+ }
37
+ },
38
+ {
39
+ "name": "lora+packing",
40
+ "args": {
41
+ "train_type": "lora",
42
+ "lora_rank": 8,
43
+ "lora_alpha": 32,
44
+ "packing": true,
45
+ "eval_steps": 200,
46
+ "save_steps": 200
47
+ }
48
+ },
49
+ {
50
+ "name": "lora+packing+ddp",
51
+ "requirements":{
52
+ "gpu": "2",
53
+ "ddp": "2"
54
+ },
55
+ "args": {
56
+ "train_type": "lora",
57
+ "lora_rank": 8,
58
+ "lora_alpha": 32,
59
+ "packing": true,
60
+ "eval_steps": 100,
61
+ "save_steps": 100
62
+ }
63
+ },
64
+ {
65
+ "name": "lora+packing+lazytokenize",
66
+ "args": {
67
+ "train_type": "lora",
68
+ "lora_rank": 8,
69
+ "lora_alpha": 32,
70
+ "packing": true,
71
+ "lazy_tokenize": true,
72
+ "eval_steps": 200,
73
+ "save_steps": 200
74
+ }
75
+ },
76
+ {
77
+ "name": "lora+",
78
+ "args": {
79
+ "train_type": "lora",
80
+ "lora_rank": 8,
81
+ "lora_alpha": 32,
82
+ "lorap_lr_ratio": 16.0
83
+ }
84
+ },
85
+ {
86
+ "name": "rslora",
87
+ "args": {
88
+ "train_type": "lora",
89
+ "lora_rank": 8,
90
+ "lora_alpha": 32,
91
+ "use_rslora": true
92
+ }
93
+ },
94
+ {
95
+ "name": "dora",
96
+ "args": {
97
+ "train_type": "lora",
98
+ "lora_rank": 8,
99
+ "lora_alpha": 32,
100
+ "use_dora": true
101
+ }
102
+ },
103
+ {
104
+ "name": "lora+neftune",
105
+ "args": {
106
+ "train_type": "lora",
107
+ "lora_rank": 8,
108
+ "lora_alpha": 32,
109
+ "neftune_noise_alpha": 15.0
110
+ }
111
+ },
112
+ {
113
+ "name": "llamapro",
114
+ "args": {
115
+ "train_type": "llamapro",
116
+ "llamapro_num_new_blocks": "4"
117
+ }
118
+ },
119
+ {
120
+ "name": "full",
121
+ "requirements":{
122
+ "gpu": "1",
123
+ "ddp": "1"
124
+ },
125
+ "args": {
126
+ "train_type": "full"
127
+ }
128
+ },
129
+ {
130
+ "name": "reft",
131
+ "requirements":{
132
+ "gpu": "1",
133
+ "ddp": "1"
134
+ },
135
+ "args": {
136
+ "train_type": "reft",
137
+ "gradient_checkpointing": "false",
138
+ "loss_scale": "default"
139
+ }
140
+ },
141
+ {
142
+ "name": "full+galore128+quantize",
143
+ "requirements":{
144
+ "gpu": "1",
145
+ "ddp": "1"
146
+ },
147
+ "args": {
148
+ "train_type": "full",
149
+ "use_galore": "true",
150
+ "galore_rank": "128",
151
+ "galore_update_proj_gap": "200",
152
+ "galore_optim_per_parameter": "false",
153
+ "galore_with_embedding": "false",
154
+ "galore_quantization": "true"
155
+ }
156
+ },
157
+ {
158
+ "name": "full+galore128+quantize+proj_quant",
159
+ "requirements":{
160
+ "gpu": "1",
161
+ "ddp": "1"
162
+ },
163
+ "args": {
164
+ "train_type": "full",
165
+ "use_galore": "true",
166
+ "galore_rank": "128",
167
+ "galore_update_proj_gap": "200",
168
+ "galore_optim_per_parameter": "false",
169
+ "galore_with_embedding": "false",
170
+ "galore_quantization": "true",
171
+ "galore_proj_quant": "true"
172
+ }
173
+ },
174
+ {
175
+ "name": "full+galore128",
176
+ "requirements":{
177
+ "gpu": "1",
178
+ "ddp": "1"
179
+ },
180
+ "args": {
181
+ "train_type": "full",
182
+ "use_galore": "true",
183
+ "galore_rank": "128",
184
+ "galore_update_proj_gap": "200",
185
+ "galore_optim_per_parameter": "false",
186
+ "galore_with_embedding": "false"
187
+ }
188
+ },
189
+ {
190
+ "name": "full+galore64",
191
+ "requirements":{
192
+ "gpu": "1",
193
+ "ddp": "1"
194
+ },
195
+ "args": {
196
+ "train_type": "full",
197
+ "use_galore": "true",
198
+ "galore_rank": "64",
199
+ "galore_update_proj_gap": "200",
200
+ "galore_optim_per_parameter": "false",
201
+ "galore_with_embedding": "false"
202
+ }
203
+ },
204
+ {
205
+ "name": "full+galore32",
206
+ "requirements":{
207
+ "gpu": "1",
208
+ "ddp": "1"
209
+ },
210
+ "args": {
211
+ "train_type": "full",
212
+ "use_galore": "true",
213
+ "galore_rank": "32",
214
+ "galore_update_proj_gap": "200",
215
+ "galore_optim_per_parameter": "false",
216
+ "galore_with_embedding": "false"
217
+ }
218
+ },
219
+ {
220
+ "name": "full+galore_emb",
221
+ "requirements":{
222
+ "gpu": "1",
223
+ "ddp": "1"
224
+ },
225
+ "args": {
226
+ "train_type": "full",
227
+ "use_galore": "true",
228
+ "galore_rank": "128",
229
+ "galore_update_proj_gap": "200",
230
+ "galore_optim_per_parameter": "false",
231
+ "galore_with_embedding": "true"
232
+ }
233
+ },
234
+ {
235
+ "name": "full+galore_perparam",
236
+ "requirements":{
237
+ "gpu": "1",
238
+ "ddp": "1"
239
+ },
240
+ "args": {
241
+ "train_type": "full",
242
+ "use_galore": "true",
243
+ "galore_rank": "128",
244
+ "galore_update_proj_gap": "200",
245
+ "galore_optim_per_parameter": "true",
246
+ "galore_with_embedding": "false"
247
+ }
248
+ },
249
+ {
250
+ "name": "adalora",
251
+ "args": {
252
+ "train_type": "adalora",
253
+ "lora_rank": 8,
254
+ "lora_alpha": 32
255
+ }
256
+ },
257
+ {
258
+ "name": "adapter",
259
+ "args": {
260
+ "train_type": "adapter"
261
+ }
262
+ },
263
+ {
264
+ "name": "full+lisa_2",
265
+ "info": "lisa 2layers + full",
266
+ "args": {
267
+ "train_type": "full",
268
+ "lisa_activated_layers": 2,
269
+ "lisa_step_interval": 20
270
+ }
271
+ },
272
+ {
273
+ "name": "full+lisa_4",
274
+ "info": "lisa 4layers + full",
275
+ "args": {
276
+ "train_type": "full",
277
+ "lisa_activated_layers": 4,
278
+ "lisa_step_interval": 20
279
+ }
280
+ },
281
+ {
282
+ "name": "unsloth+lora+q4",
283
+ "info": "unsloth lora quantization bit 4",
284
+ "args": {
285
+ "train_type": "lora",
286
+ "tuner_backend": "unsloth",
287
+ "quantization_bit": 4,
288
+ "model": "LLM-Research/Meta-Llama-3-8B-Instruct"
289
+ }
290
+ },
291
+ {
292
+ "name": "unsloth+full",
293
+ "info": "unsloth full",
294
+ "args": {
295
+ "train_type": "full",
296
+ "tuner_backend": "unsloth",
297
+ "model_type": "LLM-Research/Meta-Llama-3-8B-Instruct"
298
+ }
299
+ }
300
+ ]
301
+ }
ms-swift/scripts/benchmark/exp.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import argparse
3
+ import os
4
+ import os.path
5
+
6
+ from exp_utils import ExpManager, find_all_config
7
+
8
+ from swift.utils import *
9
+
10
+ logger = get_logger()
11
+
12
+
13
def parse_args():
    """Parse command-line options for the benchmark experiment runner.

    Returns:
        argparse.Namespace with:
            config: comma-separated experiment config file(s)/dir(s) (required).
            save_dir: folder that receives experiment outputs
                (defaults to ``./experiment``).
    """
    parser = argparse.ArgumentParser(description='Simple args for swift experiments.')
    parser.add_argument(
        '--config',
        type=str,
        default=None,
        required=True,
        help='The experiment config file',
    )
    parser.add_argument(
        '--save_dir',
        type=str,
        default='./experiment',
        required=False,
        help='The experiment output folder',
    )
    return parser.parse_args()
32
+
33
+
34
def llm_exp():
    """Entry point: run every benchmark experiment listed via ``--config``.

    Expands the comma-separated ``--config`` value into a flat list of config
    files (directories are expanded via ``find_all_config``), creates the save
    directory, then hands control to ``ExpManager``.
    """
    args = parse_args()
    os.makedirs(args.save_dir, exist_ok=True)
    # str.split(',') always yields a list, so no extra list-normalization
    # is needed (the previous isinstance check was unreachable).
    all_configs = []
    for dir_or_file in args.config.split(','):
        all_configs.extend(find_all_config(dir_or_file))
    args.config = all_configs
    ExpManager().begin(args)
47
+
48
+
49
if __name__ == '__main__':
    # Allow running this module directly as a script.
    llm_exp()
ms-swift/scripts/benchmark/generate_report.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import dataclasses
3
+ import os
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, List
6
+
7
+ import json
8
+ import numpy as np
9
+
10
+ from swift.llm.template import split_str_parts_by
11
+
12
+
13
@dataclass
class ModelOutput:
    """Aggregated result record for one benchmark experiment run."""

    group: str = None
    name: str = None
    cmd: str = None
    requirements: Dict[str, str] = dataclasses.field(default_factory=dict)
    args: Dict[str, Any] = dataclasses.field(default_factory=dict)
    memory: str = None
    train_time: float = None
    train_samples: int = None
    train_samples_per_second: float = None
    last_model_checkpoint: str = None
    best_model_checkpoint: str = None
    best_metric: Any = None
    global_step: int = None
    num_total_parameters: float = None
    num_trainable_parameters: float = None
    num_buffers: float = None
    trainable_parameters_percentage: float = None
    train_dataset_info: str = None
    val_dataset_info: str = None
    train_create_time: float = None
    eval_tokens: int = None
    eval_time: float = None
    reports: Dict[str, Any] = None
    train_loss: float = None

    @property
    def tuner_hyper_params(self):
        """Render tuner-specific hyper-parameters as a ``/``-joined string.

        Returns '' when ``args`` carries no ``sft_type`` or nothing relevant.
        """
        args = self.args
        if 'sft_type' not in args:
            return ''
        sft_type = args['sft_type']
        text = ''
        if sft_type in ('lora', 'adalora', 'longlora') and 'lora_rank' in args:
            text += (f'rank={args["lora_rank"]}/'
                     f'target={args["lora_target_modules"]}/'
                     f'alpha={args["lora_alpha"]}/'
                     f'lr_ratio={args.get("lora_lr_ratio", None)}/'
                     f'use_rslora={args.get("use_rslora", False)}/'
                     f'use_dora={args.get("use_dora", False)}')
        if sft_type == 'full' and args.get('use_galore') == 'true':
            text += (f'galore_rank={args["galore_rank"]}/'
                     f'galore_per_parameter={args["galore_optim_per_parameter"]}/'
                     f'galore_with_embedding={args["galore_with_embedding"]}/')
        if sft_type == 'llamapro':
            text += f'num_blocks={args["llamapro_num_new_blocks"]}/'
        if args.get('neftune_noise_alpha'):
            text += f'neftune_noise_alpha={args["neftune_noise_alpha"]}/'
        # Drop a single trailing separator left by the accumulation above.
        if text.endswith('/'):
            text = text[:-1]
        return text

    @property
    def hyper_parameters(self):
        """Core optimizer hyper-parameters, or '' when not recorded."""
        args = self.args
        if 'learning_rate' not in args:
            return ''
        return f'lr={args["learning_rate"]}/epoch={args["num_train_epochs"]}'

    @property
    def train_speed(self):
        """Training throughput, e.g. ``1.23(100 samples/81.30 seconds)``."""
        if not self.train_samples_per_second:
            return ''
        return (f'{self.train_samples_per_second:.2f}'
                f'({self.train_samples} samples/{self.train_time:.2f} seconds)')

    @property
    def infer_speed(self):
        """Inference throughput in tokens/second, or '' when not measured."""
        if not self.eval_tokens:
            return ''
        return (f'{self.eval_tokens / self.eval_time:.2f}'
                f'({self.eval_tokens} tokens/{self.eval_time:.2f} seconds)')
113
+
114
+
115
def generate_sft_report(outputs: List['ModelOutput']):
    """Render a markdown table summarizing SFT benchmark runs.

    Args:
        outputs: one ``ModelOutput`` per experiment; ``reports`` may carry
            gsm8k/arc/ceval evaluation entries of the form
            ``{'name': ..., 'score': ...}``.

    Returns:
        Markdown table text (two header lines plus one row per output).
        The best value of each comparable column — lowest loss, highest
        accuracy — is emphasized with ``**bold**``; missing values render
        as empty cells.
    """

    def _eval_scores(output):
        # Pull the three known eval scores out of the raw report list.
        scores = {'gsm8k': None, 'arc': None, 'ceval': None}
        for report in (output.reports or []):
            if report['name'] in scores:
                scores[report['name']] = report['score']
        return scores['gsm8k'], scores['arc'], scores['ceval']

    def _cell(value, best, fmt):
        # '' for missing values; bold when the value ties the column best.
        if not value:
            return ''
        text = f'{value:{fmt}}'
        return f'**{text}**' if np.isclose(best, value) else text

    gsm8k_accs, arc_accs, ceval_accs = [], [], []
    for output in outputs:
        gsm8k_acc, arc_acc, ceval_acc = _eval_scores(output)
        gsm8k_accs.append(gsm8k_acc)
        arc_accs.append(arc_acc)
        ceval_accs.append(ceval_acc)

    # NOTE(review): the data rows emit best_metric (eval loss) before
    # train_loss, while the header labels read 'train_loss | eval_loss' —
    # the two labels look swapped; confirm the intended column order.
    tab = '| exp_name | model_type | dataset | ms-bench mix ratio | tuner | tuner_params | trainable params(M) | flash_attn | gradient_checkpointing | hypers | memory | train speed(samples/s) | infer speed(tokens/s) | train_loss | eval_loss | gsm8k weighted acc | arc weighted acc | ceval weighted acc |\n' \
          '| -------- | ---------- | ------- | -------------------| ----- | ------------ | ------------------- | -----------| ---------------------- | ------ | ------ | ---------------------- | --------------------- | ---------- | --------- | ------------------ | ---------------- | ------------------ |\n'  # noqa

    # Column bests: losses are "lower is better" (999. sentinel for missing),
    # accuracies are "higher is better" (0. sentinel for missing).
    min_best_metric = min([o.best_metric or 999. for o in outputs], default=999.)
    min_train_loss = min([o.train_loss or 999. for o in outputs], default=999.)
    max_gsm8k = max([acc or 0. for acc in gsm8k_accs], default=0.)
    max_arc = max([acc or 0. for acc in arc_accs], default=0.)
    max_ceval = max([acc or 0. for acc in ceval_accs], default=0.)

    for output, gsm8k_acc, arc_acc, ceval_acc in zip(outputs, gsm8k_accs, arc_accs, ceval_accs):
        tab += (
            f'|{output.name}|'
            f'{output.args["model_type"]}|'
            f'{output.args.get("dataset")}|'
            f'{output.args.get("train_dataset_mix_ratio", 0.)}|'
            f'{output.args.get("sft_type")}|'
            f'{output.tuner_hyper_params}|'
            f'{output.num_trainable_parameters}({output.trainable_parameters_percentage})|'
            f'{output.args.get("use_flash_attn", "")}|'
            f'{output.args.get("gradient_checkpointing", "")}|'
            f'{output.hyper_parameters}|'
            f'{output.memory}|'
            f'{output.train_speed}|'
            f'{output.infer_speed}|'
            f'{_cell(output.best_metric, min_best_metric, ".2f")}|'
            f'{_cell(output.train_loss, min_train_loss, ".2f")}|'
            f'{_cell(gsm8k_acc, max_gsm8k, ".3f")}|'
            f'{_cell(arc_acc, max_arc, ".3f")}|'
            f'{_cell(ceval_acc, max_ceval, ".3f")}|\n'
        )
    return tab
212
+
213
+
214
def generate_export_report(outputs: List['ModelOutput']):
    """Render a markdown table summarizing quantization-export benchmark runs.

    Args:
        outputs: one ``ModelOutput`` per exported/quantized model; ``reports``
            may carry gsm8k/arc/ceval evaluation entries.

    Returns:
        Markdown table text (two header lines plus one row per output).
        The highest value of each accuracy column is emphasized with
        ``**bold**``; missing values render as empty cells.
    """

    def _eval_scores(output):
        # Pull the three known eval scores out of the raw report list.
        scores = {'gsm8k': None, 'arc': None, 'ceval': None}
        for report in (output.reports or []):
            if report['name'] in scores:
                scores[report['name']] = report['score']
        return scores['gsm8k'], scores['arc'], scores['ceval']

    def _cell(value, best, fmt):
        # '' for missing values; bold when the value ties the column best.
        if not value:
            return ''
        text = f'{value:{fmt}}'
        return f'**{text}**' if np.isclose(best, value) else text

    tab = '| exp_name | model_type | calibration dataset | quantization method | quantization bits | infer speed(tokens/s) | gsm8k weighted acc | arc weighted acc | ceval weighted acc |\n' \
          '| -------- | ---------- | ------------------- | ------------------- | ----------------- | --------------------- | ------------------ | ---------------- | ------------------ |\n'  # noqa

    gsm8k_accs, arc_accs, ceval_accs = [], [], []
    for output in outputs:
        gsm8k_acc, arc_acc, ceval_acc = _eval_scores(output)
        gsm8k_accs.append(gsm8k_acc)
        arc_accs.append(arc_acc)
        ceval_accs.append(ceval_acc)

    # Accuracy bests are "higher is better"; 0. stands in for missing scores.
    max_gsm8k = max([acc or 0. for acc in gsm8k_accs], default=0.)
    max_arc = max([acc or 0. for acc in arc_accs], default=0.)
    max_ceval = max([acc or 0. for acc in ceval_accs], default=0.)

    for output, gsm8k_acc, arc_acc, ceval_acc in zip(outputs, gsm8k_accs, arc_accs, ceval_accs):
        # Append dataset sampling details when recorded for the calibration set.
        if output.train_dataset_info:
            dataset_info = f'{output.args["dataset"]}/{output.train_dataset_info}'
        else:
            dataset_info = f'{output.args["dataset"]}'
        tab += (
            f'|{output.name}|'
            f'{output.args["model_type"]}|'
            f'{dataset_info}|'
            f'{output.args["quant_method"]}|'
            f'{output.args["quant_bits"]}|'
            f'{output.infer_speed}|'
            f'{_cell(gsm8k_acc, max_gsm8k, ".3f")}|'
            f'{_cell(arc_acc, max_arc, ".3f")}|'
            f'{_cell(ceval_acc, max_ceval, ".3f")}|\n'
        )
    return tab
284
+
285
+
286
def parse_output(file):
    """Parse one experiment-result json file into a ModelOutput.

    Two record layouts exist: `cmd == 'export'` records carry only the
    quantized checkpoint path plus optional eval results, while train records
    additionally carry memory/speed/loss/parameter statistics.  Every record
    field is optional and probed with `in` checks before use.
    """
    with open(file, 'r', encoding='utf-8') as f:
        content = json.load(f)

    name = content['name']
    group = content['group']
    cmd = content['cmd']
    requirements = content['requirements']
    args = content['args']
    create_time = float(content.get('create_time') or 0)
    # All remaining fields live under the 'record' sub-dict.
    content = content['record']
    if cmd == 'export':
        best_model_checkpoint = content['best_model_checkpoint']
        eval_tokens = 0
        eval_time = 0.0
        eval_result = None
        if 'eval_result' in content:
            eval_result = content['eval_result']
            eval_tokens = eval_result['generation_info']['tokens']
            eval_time = eval_result['generation_info']['time']
            eval_result = eval_result['report']
        return ModelOutput(
            group=group,
            name=name,
            cmd=cmd,
            requirements=requirements,
            args=args,
            best_model_checkpoint=best_model_checkpoint,
            eval_time=eval_time,
            eval_tokens=eval_tokens,
            reports=eval_result,
        )
    else:
        # Train-style record: every field defaults to None and is filled
        # only when present in the json.
        memory = None
        train_time = None
        train_samples = None
        train_samples_per_second = None
        last_model_checkpoint = None
        best_model_checkpoint = None
        best_metric = None
        global_step = None
        train_dataset_info = None
        val_dataset_info = None
        num_trainable_parameters = None
        num_buffers = None
        trainable_parameters_percentage = None
        num_total_parameters = None
        train_loss = None
        if 'memory' in content:
            memory = content['memory']
            # Per-GPU values joined as 'mem0/mem1/...'.
            memory = '/'.join(memory.values())
        if 'train_time' in content:
            train_time = content['train_time']['train_runtime']
            train_samples = content['train_time']['n_train_samples']
            train_samples_per_second = content['train_time']['train_samples_per_second']
        if 'last_model_checkpoint' in content:
            last_model_checkpoint = content['last_model_checkpoint']
        if 'best_model_checkpoint' in content:
            best_model_checkpoint = content['best_model_checkpoint']
        if 'best_metric' in content:
            best_metric = content['best_metric']
        if 'log_history' in content:
            # The final log entry carries the overall training loss.
            train_loss = content['log_history'][-1]['train_loss']
        if 'global_step' in content:
            global_step = content['global_step']
        if 'dataset_info' in content:
            train_dataset_info = content['dataset_info'].get('train_dataset')
            val_dataset_info = content['dataset_info'].get('val_dataset')
        if 'model_info' in content:
            # model_info like: SwiftModel: 6758.4041M Params (19.9885M Trainable [0.2958%]), 16.7793M Buffers.
            # split_str_parts_by yields {'key': delimiter, 'content': text-after-delimiter}
            # pairs, from which the numeric fields are recovered.
            str_dict = split_str_parts_by(content['model_info'], [
                'SwiftModel:', 'CausalLM:', 'Seq2SeqLM:', 'LMHeadModel:', 'M Params (', 'M Trainable [', ']), ',
                'M Buffers.'
            ])
            str_dict = {c['key']: c['content'] for c in str_dict}
            if 'SwiftModel:' in str_dict:
                num_total_parameters = float(str_dict['SwiftModel:'])
            elif 'CausalLM:' in str_dict:
                num_total_parameters = float(str_dict['CausalLM:'])
            elif 'Seq2SeqLM:' in str_dict:
                num_total_parameters = float(str_dict['Seq2SeqLM:'])
            elif 'LMHeadModel:' in str_dict:
                num_total_parameters = float(str_dict['LMHeadModel:'])
            num_trainable_parameters = float(str_dict['M Params ('])
            num_buffers = float(str_dict[']), '])
            trainable_parameters_percentage = str_dict['M Trainable [']

        eval_tokens = 0
        eval_time = 0.0
        eval_result = None
        if 'eval_result' in content:
            eval_result = content['eval_result']
            eval_tokens = eval_result['generation_info']['tokens']
            eval_time = eval_result['generation_info']['time']
            eval_result = eval_result['report']

        return ModelOutput(
            group=group,
            name=name,
            cmd=cmd,
            requirements=requirements,
            args=args,
            memory=memory,
            train_time=train_time,
            train_samples=train_samples,
            train_samples_per_second=train_samples_per_second,
            last_model_checkpoint=last_model_checkpoint,
            best_model_checkpoint=best_model_checkpoint,
            best_metric=best_metric,
            global_step=global_step,
            train_dataset_info=train_dataset_info,
            val_dataset_info=val_dataset_info,
            train_create_time=create_time,
            num_total_parameters=num_total_parameters,
            num_trainable_parameters=num_trainable_parameters,
            num_buffers=num_buffers,
            trainable_parameters_percentage=trainable_parameters_percentage,
            eval_time=eval_time,
            eval_tokens=eval_tokens,
            reports=eval_result,
            train_loss=train_loss,
        )
408
+
409
+
410
def generate_reports():
    """Walk ./experiment, parse every result json, and print per-group reports."""
    outputs = []
    for root, _, filenames in os.walk('./experiment'):
        for filename in filenames:
            path = os.path.join(root, filename)
            # Skip non-json files and jupyter checkpoint artifacts.
            if path.endswith('.json') and 'ipynb' not in path:
                outputs.append(parse_output(path))

    for group in {output.group for output in outputs}:
        members = [output for output in outputs if output.group == group]
        print(f'=================Printing the sft cmd result of exp {group}==================\n\n')
        print(generate_sft_report([output for output in members if output.cmd in ('sft', 'eval')]))
        # print(f'=================Printing the dpo result of exp {group}==================')
        # print(generate_dpo_report([output for output in outputs if output.cmd == 'dpo']))
        print(f'=================Printing the export cmd result of exp {group}==================\n\n')
        print(generate_export_report([output for output in members if output.cmd == 'export']))
        print('=================Printing done==================\n\n')


if __name__ == '__main__':
    generate_reports()
ms-swift/scripts/utils/run_dataset_info.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import numpy as np
5
+
6
+ from swift.llm import DATASET_MAPPING, EncodePreprocessor, get_model_tokenizer, get_template, load_dataset
7
+ from swift.utils import stat_array
8
+
9
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
10
+
11
+
12
def get_cache_mapping(fpath):
    """Parse the dataset markdown table in `fpath` into {dataset_id: (size, stat)}.

    The id is taken from the first (ModelScope) column, falling back to the
    sixth (HF) column when the first is '-'.  A '-' size column marks a huge
    dataset whose stats were never computed.
    """
    with open(fpath, 'r', encoding='utf-8') as f:
        content = f.read()
    # Keep everything from the table header onwards; skip header + separator.
    table = content[content.find('| Dataset ID |'):]
    cache_mapping = {}  # dataset_id -> (dataset_size, stat)
    for row in table.split('\n')[2:]:
        if not row:
            continue
        cells = row.split('|')
        id_cell = cells[1] if cells[1] != '-' else cells[6]
        # The cell is a markdown link: [dataset_id](url).
        dataset_id = re.search(r'\[(.+?)\]', id_cell).group(1)
        stat = cells[3:5]
        if stat[0] == '-':
            stat = ('huge dataset', '-')
        cache_mapping[dataset_id] = stat
    return cache_mapping
30
+
31
+
32
def get_dataset_id(key):
    """Return the first non-None dataset id from a (ms_id, hf_id, ...) key tuple.

    Returns None when every entry is None.  The original looped and returned
    the loop variable, which raised NameError for an empty tuple; `next` with
    a default handles that edge case while preserving all other results.
    """
    return next((dataset_id for dataset_id in key if dataset_id is not None), None)
37
+
38
+
39
def run_dataset(key, template, cache_mapping):
    """Build one markdown table row (ids, subsets, size, token stats, tags) for a dataset.

    Previously computed stats are reused from `cache_mapping`; otherwise the
    dataset is loaded, up to 100k rows are sampled and tokenized with `template`
    to compute token-length statistics.
    """
    ms_id, hf_id, _ = key
    dataset_meta = DATASET_MAPPING[key]
    tags = ', '.join(tag for tag in dataset_meta.tags) or '-'
    # Prefer the ModelScope id; fall back to downloading from HuggingFace.
    dataset_id = ms_id or hf_id
    use_hf = ms_id is None
    if ms_id is not None:
        ms_id = f'[{ms_id}](https://modelscope.cn/datasets/{ms_id})'
    else:
        ms_id = '-'
    if hf_id is not None:
        hf_id = f'[{hf_id}](https://huggingface.co/datasets/{hf_id})'
    else:
        hf_id = '-'
    subsets = '<br>'.join(subset.name for subset in dataset_meta.subsets)

    if dataset_meta.huge_dataset:
        # Too large to download just for statistics.
        dataset_size = 'huge dataset'
        stat_str = '-'
    elif dataset_id in cache_mapping:
        dataset_size, stat_str = cache_mapping[dataset_id]
    else:
        num_proc = 4
        dataset, _ = load_dataset(f'{dataset_id}:all', strict=False, num_proc=num_proc, use_hf=use_hf)
        dataset_size = len(dataset)
        # Fixed seed so the sampled subset (and hence the stats) is reproducible.
        random_state = np.random.RandomState(42)
        idx_list = random_state.choice(dataset_size, size=min(dataset_size, 100000), replace=False)
        encoded_dataset = EncodePreprocessor(template)(dataset.select(idx_list), num_proc=num_proc)

        input_ids = encoded_dataset['input_ids']
        token_len = [len(tokens) for tokens in input_ids]
        stat = stat_array(token_len)[0]
        stat_str = f"{stat['mean']:.1f}±{stat['std']:.1f}, min={stat['min']}, max={stat['max']}"

    return f'|{ms_id}|{subsets}|{dataset_size}|{stat_str}|{tags}|{hf_id}|'
74
+
75
+
76
def write_dataset_info() -> None:
    """Regenerate the dataset table in the zh/en 'supported datasets' docs.

    Previously computed stats are read back from the Chinese doc so only new
    datasets get downloaded and tokenized.  The docs are rewritten even when a
    dataset fails mid-loop (try/finally), keeping whatever rows were produced.
    """
    fpaths = ['docs/source/Instruction/支持的模型和数据集.md', 'docs/source_en/Instruction/Supported-models-and-datasets.md']
    cache_mapping = get_cache_mapping(fpaths[0])
    res_text_list = []
    res_text_list.append('| Dataset ID | Subset Name | Dataset Size | Statistic (token) | Tags | HF Dataset ID |')
    res_text_list.append('| ---------- | ----------- | -------------| ------------------| ---- | ------------- |')

    all_keys = list(DATASET_MAPPING.keys())
    all_keys = sorted(all_keys, key=lambda x: get_dataset_id(x))
    # Token statistics are computed with the Qwen2.5 tokenizer only
    # (load_model=False: no model weights are loaded).
    _, tokenizer = get_model_tokenizer('Qwen/Qwen2.5-7B-Instruct', load_model=False)
    template = get_template(tokenizer.model_meta.template, tokenizer)
    try:
        for i, key in enumerate(all_keys):
            res = run_dataset(key, template, cache_mapping)
            res_text_list.append(res)
            print(res)
    finally:
        # Replace everything from the table header onwards in both docs.
        for fpath in fpaths:
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.read()
            idx = text.find('| Dataset ID |')

            new_text = '\n'.join(res_text_list)
            text = text[:idx] + new_text + '\n'
            with open(fpath, 'w', encoding='utf-8') as f:
                f.write(text)
    print(f'数据集总数: {len(all_keys)}')


if __name__ == '__main__':
    write_dataset_info()
ms-swift/scripts/utils/run_template.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from swift.llm import TemplateType

if __name__ == '__main__':
    # Split the registered template names into generation-style vs chat-style.
    template_name_list = TemplateType.get_template_name_list()
    tn_gen = ', '.join(name for name in template_name_list if 'generation' in name)
    tn_chat = ', '.join(name for name in template_name_list if 'generation' not in name)
    print(f'Text Generation: {tn_gen}')
    print(f'Chat: {tn_chat}')
ms-swift/swift/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from .utils.import_utils import _LazyModule

if TYPE_CHECKING:
    # Eager imports for type checkers / IDEs only; never executed at runtime.
    from .version import __version__, __release_datetime__
    from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, SWIFT_MAPPING,
                         AdaLoraConfig, LoftQConfig, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, PeftConfig,
                         PeftModel, PeftModelForCausalLM, ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM,
                         PeftModelForSequenceClassification, PeftModelForTokenClassification, PrefixTuningConfig,
                         PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, get_peft_config, get_peft_model,
                         get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift,
                         SwiftTuners, LongLoRAConfig, LongLoRA, LongLoRAModelType, SCETuning, SCETuningConfig)
    from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy,
                           SchedulerType, ShardedDDPOption, TrainingArguments, Seq2SeqTrainingArguments, Trainer,
                           Seq2SeqTrainer)
    from .utils import get_logger
else:
    # Runtime: submodule -> exported names mapping so heavy imports are
    # deferred until an attribute is first accessed.
    # NOTE(review): this must stay in sync with the TYPE_CHECKING imports above.
    _import_structure = {
        'version': ['__release_datetime__', '__version__'],
        'tuners': [
            'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING',
            'LoraConfig', 'AdaLoraConfig', 'LoftQConfig', 'LoHaConfig', 'LoKrConfig', 'OFTConfig', 'PeftConfig',
            'ResTuningConfig', 'SideConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM',
            'PeftModelForSequenceClassification', 'PeftModelForTokenClassification', 'PrefixTuningConfig',
            'PromptEncoderConfig', 'PromptLearningConfig', 'PromptTuningConfig', 'get_peft_config', 'get_peft_model',
            'get_peft_model_state_dict', 'Prompt', 'PromptConfig', 'PromptModule', 'SwiftConfig', 'SwiftOutput',
            'Swift', 'SwiftTuners', 'LongLoRAConfig', 'LongLoRA', 'LongLoRAModelType', 'SCETuning', 'SCETuningConfig'
        ],
        'trainers': [
            'EvaluationStrategy',
            'FSDPOption',
            'HPSearchBackend',
            'HubStrategy',
            'IntervalStrategy',
            'SchedulerType',
            'ShardedDDPOption',
            'TrainingArguments',
            'Seq2SeqTrainingArguments',
            'Trainer',
            'Seq2SeqTrainer',
        ],
        'utils': ['get_logger']
    }

    import sys

    # Replace this module object with a lazy proxy that imports submodules
    # on first attribute access.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
ms-swift/swift/cli/__init__.py ADDED
File without changes
ms-swift/swift/cli/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes). View file
 
ms-swift/swift/cli/_megatron/pt.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# CLI entry point for `swift megatron pt` (Megatron pre-training).
from swift.megatron import megatron_pt_main

if __name__ == '__main__':
    megatron_pt_main()
ms-swift/swift/cli/_megatron/sft.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# CLI entry point for `swift megatron sft` (Megatron supervised fine-tuning).
from swift.megatron import megatron_sft_main

if __name__ == '__main__':
    megatron_sft_main()
ms-swift/swift/cli/app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# CLI entry point for `swift app` (gradio app over a model).
from swift.llm import app_main

if __name__ == '__main__':
    app_main()
ms-swift/swift/cli/eval.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Copyright (c) Alibaba, Inc. and its affiliates.
# CLI entry point for `swift eval` (model evaluation).
from swift.llm import eval_main

if __name__ == '__main__':
    eval_main()
ms-swift/swift/cli/export.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Copyright (c) Alibaba, Inc. and its affiliates.
# CLI entry point for `swift export` (merge/quantize/push models).
from swift.llm import export_main

if __name__ == '__main__':
    export_main()
ms-swift/swift/cli/main.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import importlib.util
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ from typing import Dict, List, Optional
7
+
8
+ from swift.utils import get_logger
9
+
10
+ logger = get_logger()
11
+
12
# Maps each `swift <subcommand>` name to the module that implements it;
# cli_main resolves the module path via importlib and runs it in a subprocess.
ROUTE_MAPPING: Dict[str, str] = {
    'pt': 'swift.cli.pt',
    'sft': 'swift.cli.sft',
    'infer': 'swift.cli.infer',
    'merge-lora': 'swift.cli.merge_lora',
    'web-ui': 'swift.cli.web_ui',
    'deploy': 'swift.cli.deploy',
    'rollout': 'swift.cli.rollout',
    'rlhf': 'swift.cli.rlhf',
    'sample': 'swift.cli.sample',
    'export': 'swift.cli.export',
    'eval': 'swift.cli.eval',
    'app': 'swift.cli.app',
}
26
+
27
+
28
def use_torchrun() -> bool:
    """Whether a distributed (torchrun) launch was requested via env vars."""
    env = (os.getenv('NPROC_PER_NODE'), os.getenv('NNODES'))
    return any(value is not None for value in env)


def get_torchrun_args() -> Optional[List[str]]:
    """Collect `--flag value` torchrun arguments from the environment.

    Returns None when no distributed launch is requested; unset variables are
    simply omitted from the flag list.
    """
    if not use_torchrun():
        return None
    torchrun_args: List[str] = []
    for env_key in ('NPROC_PER_NODE', 'MASTER_PORT', 'NNODES', 'NODE_RANK', 'MASTER_ADDR'):
        env_val = os.getenv(env_key)
        if env_val is not None:
            torchrun_args.append(f'--{env_key.lower()}')
            torchrun_args.append(env_val)
    return torchrun_args
46
+
47
+
48
def _compat_web_ui(argv):
    """Back-compat shim: `swift web-ui --model ...` is rewritten to `swift app`."""
    method_name = argv[0]
    model_flags = ('--model', '--adapters', '--ckpt_dir')
    if method_name in {'web-ui', 'web_ui'} and any(flag in argv for flag in model_flags):
        # Mutates argv in place so the caller dispatches to the app command.
        argv[0] = 'app'
        logger.warning('Please use `swift app`.')
55
+
56
def cli_main(route_mapping: Optional[Dict[str, str]] = None) -> None:
    """Dispatch `swift <command> ...` to the matching CLI module in a subprocess.

    Args:
        route_mapping: command-name -> module-path mapping; defaults to
            ROUTE_MAPPING.

    Exits with the child process's return code when it fails.
    """
    route_mapping = route_mapping or ROUTE_MAPPING
    argv = sys.argv[1:]
    _compat_web_ui(argv)
    # Accept both `foo_bar` and `foo-bar` spellings of a command.
    method_name = argv[0].replace('_', '-')
    argv = argv[1:]
    file_path = importlib.util.find_spec(route_mapping[method_name]).origin
    torchrun_args = get_torchrun_args()
    python_cmd = sys.executable
    # Only training/inference commands are launched under torchrun.
    if torchrun_args is None or method_name not in {'pt', 'sft', 'rlhf', 'infer'}:
        args = [python_cmd, file_path, *argv]
    else:
        args = [python_cmd, '-m', 'torch.distributed.run', *torchrun_args, file_path, *argv]
    print(f"run sh: `{' '.join(args)}`", flush=True)
    result = subprocess.run(args)
    if result.returncode != 0:
        # Propagate the child's exit status to the caller.
        sys.exit(result.returncode)


if __name__ == '__main__':
    cli_main()
ms-swift/swift/cli/pt.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Copyright (c) Alibaba, Inc. and its affiliates.
# CLI entry point for `swift pt` (pre-training).
from swift.llm import pt_main

if __name__ == '__main__':
    pt_main()
ms-swift/swift/cli/rollout.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Copyright (c) Alibaba, Inc. and its affiliates.
# CLI entry point for `swift rollout` (rollout server for RL training).
from swift.llm import rollout_main

if __name__ == '__main__':
    rollout_main()
ms-swift/swift/hub/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (231 Bytes). View file
 
ms-swift/swift/hub/__pycache__/hub.cpython-310.pyc ADDED
Binary file (13.4 kB). View file
 
ms-swift/swift/llm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.92 kB). View file
 
ms-swift/swift/llm/__pycache__/data_loader.cpython-310.pyc ADDED
Binary file (4.21 kB). View file
 
ms-swift/swift/llm/app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .app import SwiftApp, app_main
ms-swift/swift/llm/argument/app_args.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dataclasses import dataclass
3
+ from typing import Literal, Optional
4
+
5
+ from swift.utils import find_free_port, get_logger
6
+ from ..model import get_matched_model_meta
7
+ from ..template import get_template_meta
8
+ from .deploy_args import DeployArguments
9
+ from .webui_args import WebUIArguments
10
+
11
+ logger = get_logger()
12
+
13
+
14
@dataclass
class AppArguments(WebUIArguments, DeployArguments):
    """Arguments for `swift app`: a web UI over a local or remote model."""
    # URL of an already-running OpenAI-compatible server; when set, the app
    # connects remotely instead of loading the model locally.
    base_url: Optional[str] = None
    # Title for the studio page.  -- presumably shown in the UI header; TODO confirm
    studio_title: Optional[str] = None
    # Whether the app runs in multimodal mode; resolved from model_meta when None.
    is_multimodal: Optional[bool] = None

    # UI language.
    lang: Literal['en', 'zh'] = 'en'
    verbose: bool = False

    def _init_torch_dtype(self) -> None:
        # Remote mode: no local model is loaded, so skip dtype initialization
        # and only resolve model metadata for template/system lookup.
        if self.base_url:
            self.model_meta = get_matched_model_meta(self.model)
            return
        super()._init_torch_dtype()

    def __post_init__(self):
        super().__post_init__()
        self.server_port = find_free_port(self.server_port)
        if self.model_meta:
            # Fall back to the template's default system prompt.
            if self.system is None:
                self.system = get_template_meta(self.model_meta.template).default_system
            if self.is_multimodal is None:
                self.is_multimodal = self.model_meta.is_multimodal
        if self.is_multimodal is None:
            self.is_multimodal = False
ms-swift/swift/llm/data_loader.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+ from torch.utils.data import DataLoader
6
+
7
+
8
class BatchSamplerShard:
    """Shards dataset indices evenly across distributed ranks and yields index batches.

    Each rank owns a contiguous (or shuffled) slice of `total_samples // world_size`
    indices; remainder samples beyond an even split are discarded.
    """

    def __init__(self, total_samples: int, batch_size: int, shuffle: bool, drop_last: bool, data_seed: Optional[int]):
        # Per-rank sample count (floor division drops the uneven remainder).
        self.total_samples = total_samples // self.world_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.base_seed = data_seed or 0
        self.curr_seed = self.base_seed

    @property
    def rank(self):
        if not dist.is_initialized():
            return 0
        return dist.get_rank()

    @property
    def world_size(self):
        if not dist.is_initialized():
            return 1
        return dist.get_world_size()

    def __iter__(self):
        offset = self.rank * self.total_samples
        if self.shuffle:
            # Seeded permutation over the global index space so every rank
            # draws a disjoint slice of the same shuffle.
            generator = torch.Generator()
            generator.manual_seed(self.curr_seed)
            permutation = torch.randperm(self.total_samples * self.world_size, generator=generator).tolist()
            indices = permutation[offset:offset + self.total_samples]
        else:
            indices = list(range(offset, offset + self.total_samples))

        for start in range(0, len(indices), self.batch_size):
            batch = indices[start:start + self.batch_size]
            # The trailing partial batch is emitted only when drop_last is off.
            if len(batch) == self.batch_size or not self.drop_last:
                yield batch

    def set_epoch(self, epoch: int):
        # Re-seed per epoch so shuffling differs across epochs but stays reproducible.
        self.curr_seed = self.base_seed + epoch

    def __len__(self) -> int:
        full_batches, remainder = divmod(self.total_samples, self.batch_size)
        if self.drop_last or remainder == 0:
            return full_batches
        return full_batches + 1
55
+
56
+
57
class DataLoaderShard(DataLoader):
    """DataLoader driven by a BatchSamplerShard, exposing set_epoch for reshuffling."""

    def __init__(self, dataset, batch_sampler: BatchSamplerShard, **dataloader_params):
        # Keep an explicit handle so set_epoch can reseed between epochs.
        self.batch_sampler = batch_sampler
        super().__init__(dataset, batch_sampler=batch_sampler, **dataloader_params)

    def set_epoch(self, epoch: int):
        self.batch_sampler.set_epoch(epoch)
65
+
66
+
67
class DataLoaderDispatcher:
    """Rank 0 reads from the base dataloader and scatters one batch to every rank.

    In non-distributed runs this degenerates to plain iteration over the base
    dataloader.  Iteration stops for all ranks once the base iterator is
    exhausted (rank 0 scatters None sentinels).
    """

    def __init__(self, base_dataloader):
        self.base_dataloader = base_dataloader

    @property
    def group(self):
        return dist.group.WORLD if dist.is_initialized() else 1

    @property
    def rank(self):
        if dist.is_initialized():
            return dist.get_rank(self.group)
        return 0

    @property
    def world_size(self):
        if dist.is_initialized():
            return dist.get_world_size(self.group)
        return 1

    def _scatter_object_list(self, inputs):
        # Single-process fallback: just hand back the first element.
        if not dist.is_initialized():
            return inputs[0]
        outputs = [None]
        global_src_rank = dist.get_global_rank(self.group, 0)
        dist.scatter_object_list(outputs, inputs, global_src_rank, group=self.group)
        return outputs[0]

    def __iter__(self):
        base_iter = iter(self.base_dataloader)
        while True:
            if self.rank == 0:
                try:
                    shard = [next(base_iter) for _ in range(self.world_size)]
                except StopIteration:
                    # Signal end-of-data to every rank.
                    shard = [None] * self.world_size
                shard = self._scatter_object_list(shard)
            else:
                shard = self._scatter_object_list(None)
            if shard is None:
                return
            yield shard
ms-swift/swift/llm/dataset/dataset/__pycache__/mllm.cpython-310.pyc ADDED
Binary file (36 kB). View file
 
ms-swift/swift/llm/dataset/dataset/llm.py ADDED
@@ -0,0 +1,856 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import ast
3
+ import re
4
+ from functools import partial
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ import json
8
+ import numpy as np
9
+
10
+ from ...template import split_str_parts_by
11
+ from ..preprocessor import (AlpacaPreprocessor, ClsGenerationPreprocessor, ClsPreprocessor, MessagesPreprocessor,
12
+ ResponsePreprocessor, RowPreprocessor, TextGenerationPreprocessor)
13
+ from ..register import DatasetMeta, SubsetDataset, register_dataset
14
+
15
+
16
class AlpacaZhPreprocessor(AlpacaPreprocessor):
    """Alpaca-zh rows sometimes prefix the input column with '输入:'; strip it."""

    @classmethod
    def concat_inst_input(cls, instruction, input_):
        # The 3-char prefix ("input:") is redundant once instruction and
        # input are concatenated.
        if input_ and input_.startswith('输入:'):
            input_ = input_[3:]
        return super().concat_inst_input(instruction, input_)


# Chinese GPT-4-generated alpaca-style instruction dataset.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/alpaca-gpt4-data-zh',
        hf_dataset_id='llm-wizard/alpaca-gpt4-data-zh',
        preprocess_func=AlpacaZhPreprocessor(),
        tags=['chat', 'general', '🔥'],
    ))
32
+
33
+
34
class LongAlpacaPreprocessor(AlpacaPreprocessor):
    """Drop the boilerplate 'Answer: ' prefix from LongAlpaca responses."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        response = row['response']
        prefix_prompt = 'Answer: '
        if response and response.startswith(prefix_prompt):
            response = response[len(prefix_prompt):].strip()
            row['output'] = response
        return super().preprocess(row)


# Long-context QA dataset (12k samples).
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/LongAlpaca-12k',
        hf_dataset_id='Yukang/LongAlpaca-12k',
        preprocess_func=LongAlpacaPreprocessor(),
        tags=['long-sequence', 'QA'],
    ))
52
+
53
+
54
class RuozhibaPreprocessor(RowPreprocessor):
    """Build a pretrain sample from a ruozhiba post title (plus optional abstract)."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Fall back to 'content' when there is no title; append a distinct abstract.
        text = row['content'] if row.get('title', None) is None else row['title']
        abstract = row['abs'] if 'abs' in row else None
        if abstract and abstract != text:
            text = text + ',' + abstract

        # Strip a leading "<number>." style enumeration prefix when present.
        numbered = re.search(r'\d+[\.,\s,\、](.+)', text)
        if numbered:
            text = numbered.group(1)
        if text:
            return {'messages': [{'role': 'assistant', 'content': text}]}


register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/ruozhiba',
        subsets=['post-annual', 'title-good', 'title-norm'],
        preprocess_func=RuozhibaPreprocessor(),
        tags=['pretrain', '🔥']))
76
+
77
+
78
class MathTrnPreprocessor(ResponsePreprocessor):
    """Keep only the query/response columns of math-trn-format rows."""

    def preprocess(self, row):
        query = row['query']
        output = row['response']
        # Rebuild the row so any extra columns are discarded.
        row = {
            'query': query,
            'response': output,
        }
        return super().preprocess(row)


register_dataset(
    DatasetMeta(ms_dataset_id='AI-ModelScope/math-trn-format', preprocess_func=MathTrnPreprocessor(), tags=['math']))
92
+
93
+
94
+ def _repair_ms_bench(messages: str) -> Optional[List[Dict[str, str]]]:
95
+ if isinstance(messages, str):
96
+ messages = ast.literal_eval(messages)
97
+ default_system = 'You are a helpful assistant.'
98
+ messages: List[Dict[str, str]]
99
+ if messages[0]['from'] == 'system' and messages[0]['value'] == default_system:
100
+ messages.pop(0)
101
+ # skip MOSS
102
+ for c in messages:
103
+ value = c['value'].lower()
104
+ if 'moss' in value or 'human:' in value or 'assistant:' in value or 'user:' in value:
105
+ return
106
+ return messages
107
+
108
+
109
# Multi-round general chat; rows are repaired/filtered by _repair_ms_bench.
register_dataset(
    DatasetMeta(
        ms_dataset_id='iic/ms_bench',
        preprocess_func=MessagesPreprocessor(repair_messages=_repair_ms_bench),
        tags=['chat', 'general', 'multi-round', '🔥']))
114
+
115
+
116
+ def _repair_agent_messages(messages: List[Dict[str, str]], use_mini: bool) -> Optional[List[Dict[str, str]]]:
117
+ if use_mini:
118
+ pattern = r'\d\. {"plugin_name": "(.+?)"'
119
+ if messages[0]['from'] != 'system':
120
+ return
121
+ system = messages[0]['value']
122
+ find_list = re.findall(pattern, system)
123
+ if len(set(find_list)) <= 1:
124
+ return
125
+ return messages
126
+
127
+
128
# MSAgent-Bench: full subset plus a filtered 'mini' subset (>=2 distinct plugins).
register_dataset(
    DatasetMeta(
        ms_dataset_id='damo/MSAgent-Bench',
        subsets=[
            SubsetDataset(
                preprocess_func=MessagesPreprocessor(repair_messages=partial(_repair_agent_messages, use_mini=False))),
            SubsetDataset(
                name='mini',
                preprocess_func=MessagesPreprocessor(repair_messages=partial(_repair_agent_messages, use_mini=True)),
                is_weak_subset=True)
        ],
        split=['train', 'validation'],
        tags=['chat', 'agent', 'multi-round']))

# Prompt template for the AdvertiseGen text-generation task ({{QUERY}} is
# substituted with the row's keywords).
advertise_gen_prompt = """Task: Generating advertisements based on keywords.
Keywords: {{QUERY}}
Advertisements:"""

register_dataset(
    DatasetMeta(
        ms_dataset_id='lvjianjin/AdvertiseGen',
        hf_dataset_id='shibing624/AdvertiseGen',
        preprocess_func=TextGenerationPreprocessor(
            prompt=advertise_gen_prompt, columns={
                'content': 'query',
                'summary': 'response'
            }),
        tags=['text-generation', '🔥'],
        split=['train', 'validation'],
    ))
158
+
159
+
160
class FireflyPreprocessor(ResponsePreprocessor):
    """Keep only firefly rows whose task ``kind`` is one of the supported kinds."""

    _firefly_kind_list = {
        'ProseGeneration', 'MRC', 'JinYongGeneration', 'TextCorrection', 'ClassicalChinese', 'BELLE', 'StoryGeneration',
        'Couplet', 'Cot', 'Dictionary', 'Translation', 'Program', 'SentimentAnalyze', 'OpenQA', 'AncientPoem',
        'TextMatching', 'NLI', 'Summary', 'KeywordRecognition', 'ProductDesc', 'LyricGeneration', 'Composition',
        'MusicComment', 'NER'
    }

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Rows of an unsupported task kind are dropped (None).
        if row['kind'] in self._firefly_kind_list:
            return super().preprocess(row)
        return None
172
+
173
+
174
# firefly-train-1.1M: multi-task Chinese instruction data, filtered by kind.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/firefly-train-1.1M',
        hf_dataset_id='YeungNLP/firefly-train-1.1M',
        preprocess_func=FireflyPreprocessor(),
        tags=['chat', 'general'],
    ))

# CLUE/cmnli: natural language inference rendered as text generation.
register_dataset(
    DatasetMeta(
        ms_dataset_id='modelscope/clue',
        hf_dataset_id='clue',
        subsets=['cmnli'],
        preprocess_func=ClsGenerationPreprocessor(['neutral', 'entailment', 'contradiction'],
                                                  task='Natural Language Inference',
                                                  is_pair_seq=True),
        tags=['text-generation', 'classification'],
        split=['train', 'validation'],
    ))

# JD sentiment: 'default' renders the label as text; 'cls' keeps integer labels.
register_dataset(
    DatasetMeta(
        ms_dataset_id='DAMO_NLP/jd',
        subsets=[
            SubsetDataset(
                'default',
                'default',
                preprocess_func=ClsGenerationPreprocessor(['negative', 'positive'],
                                                          task='Sentiment Classification',
                                                          is_pair_seq=False)),
            SubsetDataset(
                'cls',
                'default',
                preprocess_func=ClsPreprocessor(columns={'sentence': 'query'}),
            ),
        ],
        tags=['text-generation', 'classification', '🔥'],
        split=['train', 'validation'],
    ))
213
+
214
+
215
class SyntheticText2SqlPreprocessor(ResponsePreprocessor):
    """Turn synthetic_text_to_sql rows into a chain-of-thought NL2SQL sample."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        question = f"Sql Table information:\n{row['sql_context']}\n{row['sql_prompt']}"
        reasoning = (f"Let's think step by step:\n{row['sql_explanation']}\n"
                     f"So the final sql is:\n{row['sql']}")
        return super().preprocess({'query': question, 'response': reasoning})
225
+
226
+
227
# Gretel synthetic text-to-SQL with step-by-step explanations.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/synthetic_text_to_sql',
        hf_dataset_id='gretelai/synthetic_text_to_sql',
        preprocess_func=SyntheticText2SqlPreprocessor(),
        tags=['nl2sql', 'en']))
233
+
234
+
235
+ def _repair_toolbench(conversations: List[Dict[str, str]]) -> List[Dict[str, str]]:
236
+ assert len(conversations) == 2
237
+ if conversations[1]['from'] in {'caller', 'conclusion'}:
238
+ conversations[1]['from'] = 'assistant'
239
+ return conversations
240
+
241
+
242
# alpha-umi ToolBench: agent data split by module role (backbone/caller/...).
register_dataset(
    DatasetMeta(
        ms_dataset_id='shenweizhou/alpha-umi-toolbench-processed-v2',
        subsets=['backbone', 'caller', 'planner', 'summarizer'],
        preprocess_func=MessagesPreprocessor(repair_messages=_repair_toolbench),
        tags=['chat', 'agent', '🔥'],
        huge_dataset=True))
249
+
250
+
251
class BlossomMathPreprocessor(ResponsePreprocessor):
    """Append the gold answer to the worked solution for blossom-math rows."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        solution = f"{row['output']}\n\nAnswer: {row['answer']}"
        return super().preprocess({'query': row['query'], 'response': solution})
256
+
257
+
258
# blossom-math-v2: math word problems with worked solutions plus final answer.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/blossom-math-v2',
        hf_dataset_id='Azure99/blossom-math-v2',
        preprocess_func=BlossomMathPreprocessor(),
        tags=['chat', 'math', '🔥']))

# sql-create-context: question + schema context -> SQL, in alpaca format.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/sql-create-context',
        hf_dataset_id='b-mc2/sql-create-context',
        preprocess_func=AlpacaPreprocessor(columns={
            'question': 'instruction',
            'context': 'input',
            'answer': 'output'
        }),
        tags=['chat', 'sql', '🔥']))
275
+
276
+
277
class TigerBotLawPreprocessor(ResponsePreprocessor):
    """Flatten tigerbot law articles (type/title/chapters/body) into pretraining text."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        # Header: law type and title, each on its own line.
        pieces = ['{type}\n{title}\n'.format(type=row['type'], title=row['title'])]
        # Up to three nested chapter levels; missing levels are None.
        for level in range(1, 4):
            chapter = row[f'chapter{level}']
            if chapter is not None:
                pieces.append(f'{chapter}')
        pieces.append(f'{row["response"]}')
        return super().preprocess({'response': ''.join(pieces)})
290
+
291
+
292
# tigerbot-law-plugin: Chinese law articles as pretraining text.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/tigerbot-law-plugin',
        hf_dataset_id='TigerResearch/tigerbot-law-plugin',
        preprocess_func=TigerBotLawPreprocessor(),
        tags=['text-generation', 'law', 'pretrained']))

# CodeExercise-Python-27k: chat rounds stored under 'chat_rounds'.
register_dataset(
    DatasetMeta(
        ms_dataset_id='codefuse-ai/CodeExercise-Python-27k',
        preprocess_func=MessagesPreprocessor(columns={'chat_rounds': 'messages'}),
        tags=['chat', 'coding', '🔥']))
304
+
305
+
306
class LeetcodePythonPreprocessor(ResponsePreprocessor):
    """Split leetcode rows into problem statement (query) and code + explanation (response)."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        text = row['code_with_problem']
        # Everything before the fenced python block is the problem statement.
        split_at = text.find('```python')
        statement = text[:split_at]
        if statement.startswith('# '):
            statement = statement[2:]
        solution = text[split_at:].strip()
        answer = f"{solution}\n\n{row['explanation_only']}"
        return super().preprocess({'query': statement, 'response': answer})
317
+
318
+
319
# leetcode-solutions-python: problem -> solution code plus explanation.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/leetcode-solutions-python',
        preprocess_func=LeetcodePythonPreprocessor(),
        tags=['chat', 'coding', '🔥']))
324
+
325
+
326
class StsbPreprocessor(ResponsePreprocessor):
    """Map STS-B sentence pairs to (query, response, label) for embedding training."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        mapped = {'query': row['sentence1'], 'response': row['sentence2'], 'label': row['score']}
        return super().preprocess(mapped)
335
+
336
+
337
class StsbGeneratePreprocessor(ResponsePreprocessor):
    """Render STS-B as text generation: the model emits the similarity score."""

    prompt = """Task: Based on the given two sentences, provide a similarity score between 0.0 and 1.0.
Sentence 1: {text1}
Sentence 2: {text2}
Similarity score: """

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        query = self.prompt.format(text1=row['sentence1'], text2=row['sentence2'])
        # Score is formatted to one decimal place, e.g. "0.8".
        return super().preprocess({'query': query, 'response': f"{row['score']:.1f}"})
348
+
349
+
350
class StsbRegressionPreprocessor(StsbGeneratePreprocessor):
    """Like the generate variant, but keeps the raw score as a regression label."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        query = self.prompt.format(text1=row['sentence1'], text2=row['sentence2'])
        # Skip StsbGeneratePreprocessor.preprocess on purpose: we want the
        # numeric label, not a formatted response string.
        return super(StsbGeneratePreprocessor, self).preprocess({'query': query, 'label': row['score']})
359
+
360
+
361
# STS-B registered three ways: embedding pairs, generation, and regression.
register_dataset(
    DatasetMeta(
        ms_dataset_id='sentence-transformers/stsb',
        hf_dataset_id='sentence-transformers/stsb',
        subsets=[
            SubsetDataset('default', preprocess_func=StsbPreprocessor()),  # embedding
            SubsetDataset('generate', preprocess_func=StsbGeneratePreprocessor()),
            SubsetDataset('reg', preprocess_func=StsbRegressionPreprocessor()),
        ],
        tags=['similarity', '🔥']))
371
+
372
+
373
+ def _repair_conversations_agent_instruct(s: str) -> List[Dict[str, Any]]:
374
+ s = s.replace('}\n {', '},\n {')
375
+ if isinstance(s, str):
376
+ s = ast.literal_eval(s)
377
+ return s
378
+
379
+
380
# AgentInstruct: per-environment agent trajectories (alfworld, db, kg, ...).
register_dataset(
    DatasetMeta(
        ms_dataset_id='huangjintao/AgentInstruct_copy',
        subsets=['alfworld', 'db', 'kg', 'mind2web', 'os', 'webshop'],
        preprocess_func=MessagesPreprocessor(repair_messages=_repair_conversations_agent_instruct),
        tags=['chat', 'agent', 'multi-round']))
386
+
387
+
388
class MultiRoleAgentPreprocessor(RowPreprocessor):
    """Flatten multi-role chat-room conversations into a single-turn sample.

    The last turn becomes the assistant response; intermediate turns are
    folded into the system prompt as a rendered chat history.
    """

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        conv = row['conversations']
        # Behavioral guard-rails appended to the system prompt (Chinese):
        # chat-room etiquette, stay in one role, keep replies under ~50 chars.
        res_prompt = '\n\n【注意事项】\n1. 这是聊天室,不要发送私信给任何人\n2. 仅代表你个人说话,不要扮演其他人,只根据对话历史进行回复\n3. 长话短说,不要说太多话,不要超过50字 '
        history_prompt = '\n\n【chat history】'
        conv_prompt = '\n {name}:{content}'
        # The final turn is always treated as the assistant's answer.
        query, response = '', conv[-1]['value']
        system = conv[0]['value'] if conv[0]['from'] == 'system' else ''
        if conv[0]['from'] == 'user':
            # Simple two-party case: first turn is the user query.
            query = conv[0]['value']
        elif 'next_speakers:' not in system:
            # Multi-role case: append guard-rails (once) and render the
            # intermediate turns into the system prompt as chat history.
            if '【注意事项】' not in system and system:
                system += res_prompt
            system += history_prompt
            system += ''.join([conv_prompt.format(name=c['from'], content=c['value']) for c in conv[1:-1]])

        # Drop samples where either side of the exchange is empty.
        if not query or not response:
            return

        return {
            'messages': [{
                'role': 'system',
                'content': system
            }, {
                'role': 'user',
                'content': query
            }, {
                'role': 'assistant',
                'content': response
            }],
        }
420
+
421
+
422
# MSAgent-MultiRole: multi-agent chat-room role-play data.
register_dataset(
    DatasetMeta(
        ms_dataset_id='iic/MSAgent-MultiRole',
        preprocess_func=MultiRoleAgentPreprocessor(),
        tags=['chat', 'agent', 'multi-round', 'role-play', 'multi-agent']))

# ToolBench: used as-is (AutoPreprocessor default).
register_dataset(DatasetMeta(ms_dataset_id='swift/ToolBench', tags=['chat', 'agent', 'multi-round']))

# competition_math: train/test splits under the 'default' subset.
register_dataset(
    DatasetMeta(
        ms_dataset_id='tastelikefeet/competition_math',
        subsets=[
            SubsetDataset(
                name='default',
                subset='default',
                split=['train', 'test'],
            ),
        ],
        tags=['qa', 'math']))

register_dataset(DatasetMeta(ms_dataset_id='modelscope/gsm8k', subsets=['main'], split=['train'], tags=['qa', 'math']))

register_dataset(
    DatasetMeta(ms_dataset_id='modelscope/MathR', subsets=['default', 'clean'], split=['train'], tags=['qa', 'math']))

register_dataset(
    DatasetMeta(ms_dataset_id='modelscope/MathR-32B-Distill', subsets=['data'], split=['train'], tags=['qa', 'math']))
449
+
450
+
451
class CoundownTaskPreprocessor(ResponsePreprocessor):
    """Build Countdown-style equation prompts for GRPO-style training.

    NOTE(review): class name carries the original 'Coundown' typo; kept because
    the register call refers to it by this name.
    """

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        numbers = row['nums']
        # The original 'response' column holds the target number, not a reply.
        target = row.pop('response', None)
        query = (f'Using the numbers {numbers}, create an equation that equals {target}.\n'
                 'You can use basic arithmetic operations (+, -, *, /) and each number can only be used once.\n'
                 'Show your work in <think> </think> tags. And return the final equation and answer '
                 'in <answer> </answer> tags, for example <answer> (1 + 2) / 3 * 4 = 4 </answer>.')
        row.update({'target': target, 'query': query})
        return super().preprocess(row)
462
+
463
+
464
# Countdown-Tasks-3to4: arithmetic puzzle prompts for RL training.
register_dataset(
    DatasetMeta(
        ms_dataset_id='zouxuhong/Countdown-Tasks-3to4',
        subsets=['default'],
        preprocess_func=CoundownTaskPreprocessor(),
        tags=['math']))
470
+
471
+
472
class HC3Preprocessor(ResponsePreprocessor):
    """Expand each HC3 row into two samples: one human answer, one ChatGPT answer.

    The response is the source label ('Human' or 'ChatGPT'), making this a
    generation-style classification task.
    """

    prompt = """Classification Task: Are the following responses from a human or from ChatGPT?
Question: {question}
Answer: {answer}
Category: Human, ChatGPT
Output:"""

    def preprocess(self, row):
        samples = []
        for source in ('Human', 'ChatGPT'):
            # Pick one answer at random from the matching answer pool.
            answer = self.random_state.choice(row[f'{source.lower()}_answers'])
            query = self.prompt.format(question=row['query'], answer=answer)
            samples.append(super().preprocess({'query': query, 'response': source}))
        return samples
486
+
487
+
488
class HC3ClsPreprocessor(HC3Preprocessor):
    """HC3 as a true classification task: label 0 = Human, 1 = ChatGPT."""

    def preprocess(self, row):
        samples = []
        for label, source in enumerate(('Human', 'ChatGPT')):
            answer = self.random_state.choice(row[f'{source.lower()}_answers'])
            query = self.prompt.format(question=row['query'], answer=answer)
            # Bypass HC3Preprocessor.preprocess: emit an integer label instead
            # of a textual response.
            samples.append(ResponsePreprocessor.preprocess(self, {'query': query, 'label': label}))
        return samples
497
+
498
+
499
# Build HC3-Chinese subsets: each domain gets a generation variant and a
# '<name>_cls' classification variant.
hc3_subset_names = ['baike', 'open_qa', 'nlpcc_dbqa', 'finance', 'medicine', 'law', 'psychology']
hc3_subsets: List[SubsetDataset] = []
for hc3_subset_name in hc3_subset_names:
    hc3_subsets.append(
        SubsetDataset(
            name=hc3_subset_name,
            subset=hc3_subset_name,
            preprocess_func=HC3Preprocessor(),
        ))
    hc3_subsets.append(
        SubsetDataset(
            name=f'{hc3_subset_name}_cls',
            subset=hc3_subset_name,
            preprocess_func=HC3ClsPreprocessor(),
        ))

register_dataset(
    DatasetMeta(
        ms_dataset_id='simpleai/HC3-Chinese',
        hf_dataset_id='Hello-SimpleAI/HC3-Chinese',
        subsets=hc3_subsets,
        tags=['text-generation', 'classification', '🔥']))

# Same construction for the English HC3, which only covers two domains.
hc3_subset_names = ['finance', 'medicine']
hc3_subsets: List[SubsetDataset] = []
for hc3_subset_name in hc3_subset_names:
    hc3_subsets.append(
        SubsetDataset(
            name=hc3_subset_name,
            subset=hc3_subset_name,
            preprocess_func=HC3Preprocessor(),
        ))
    hc3_subsets.append(
        SubsetDataset(
            name=f'{hc3_subset_name}_cls',
            subset=hc3_subset_name,
            preprocess_func=HC3ClsPreprocessor(),
        ))

register_dataset(
    DatasetMeta(
        ms_dataset_id='simpleai/HC3',
        hf_dataset_id='Hello-SimpleAI/HC3',
        subsets=hc3_subsets,
        preprocess_func=HC3Preprocessor(),
        tags=['text-generation', 'classification', '🔥']))
545
+
546
+
547
class DureaderPreprocessor(RowPreprocessor):
    """Turn DuReader_robust rows (text1 = 'answer[SEP]context') into question generation."""

    _PROMPT = """Task: Question Generation
Context: {context}
Answer: {answer}
Question:"""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        answer, context = row['text1'].split('[SEP]')
        user_turn = {'role': 'user', 'content': self._PROMPT.format(context=context, answer=answer)}
        assistant_turn = {'role': 'assistant', 'content': row['text2']}
        return {'messages': [user_turn, assistant_turn]}
564
+
565
+
566
# DuReader_robust-QG: question generation from (context, answer) pairs.
register_dataset(
    DatasetMeta(
        ms_dataset_id='modelscope/DuReader_robust-QG',
        preprocess_func=DureaderPreprocessor(),
        split=['train', 'validation', 'test'],
        tags=['text-generation', '🔥']))
572
+
573
+
574
class HHRLHFPreprocessor(RowPreprocessor):
    """Parse Anthropic hh-rlhf transcripts into chosen/rejected message lists."""

    @staticmethod
    def _to_messages(data):
        # Alternating texts: even indices are user turns, odd are assistant.
        messages = []
        it = iter(data)
        for query, response in zip(it, it):
            messages.append({'role': 'user', 'content': query})
            messages.append({'role': 'assistant', 'content': response})
        return messages

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        # 'Hum:' appears in some transcripts as a truncated 'Human:' marker.
        splitter = '\n\nHuman:|\n\nAssistant:|\n\nHum:'
        parts_chosen = [s.strip() for s in re.split(splitter, row['chosen'].strip())]
        parts_rejected = [s.strip() for s in re.split(splitter, row['rejected'].strip())]
        # Some transcripts keep a leading bare 'Human:' prefix; strip it.
        if parts_chosen[0].startswith('Human:'):
            assert parts_rejected[0].startswith('Human:')
            parts_chosen[0] = parts_chosen[0][6:].strip()
            parts_rejected[0] = parts_rejected[0][6:].strip()
        row['messages'] = self._to_messages(parts_chosen)
        row['rejected_messages'] = self._to_messages(parts_rejected)
        return row
596
+
597
+
598
# TODO meta file broken
# Anthropic hh-rlhf preference data (chosen/rejected dialogues) for DPO.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/hh-rlhf',
        subsets=['helpful-base', 'helpful-online', 'helpful-rejection-sampled'],
        preprocess_func=HHRLHFPreprocessor(),
        split=['train', 'test'],
        tags=['rlhf', 'dpo'],
        huge_dataset=True))
607
+
608
+
609
class XlamFunctionCallingPreprocessor(ResponsePreprocessor):
    """Render one randomly-chosen xlam function call as a ReAct-style response."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        answers = row['response']
        if isinstance(answers, str):
            answers = json.loads(answers)
        # One row can list several valid calls; sample one of them.
        picked = np.random.choice(answers)
        call = f"Action: {picked['name']}\nAction Input: {json.dumps(picked['arguments'])}"
        sample = {'query': row['query'], 'response': call, 'solution': call, 'tools': row['tools']}
        return super().preprocess(sample)
622
+
623
+
624
# xlam-function-calling-60k: single-turn tool-calling data.
register_dataset(
    DatasetMeta(
        ms_dataset_id='LLM-Research/xlam-function-calling-60k',
        subsets=['dataset'],
        preprocess_func=XlamFunctionCallingPreprocessor(),
        tags=['agent']))
630
+
631
+
632
class HHRLHFCNPreprocessor(MessagesPreprocessor):
    """Attach chosen/rejected answers from hh_rlhf_cn rows for preference training."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        # The rejected text becomes the negative response; the chosen turn
        # completes the dialogue.
        row['rejected_response'] = row['rejected']['text']
        row['messages'].append(row.pop('chosen'))
        return super().preprocess(row)
638
+
639
+
640
# hh_rlhf_cn: Chinese/English preference pairs; context column holds the dialogue.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/hh_rlhf_cn',
        subsets=['hh_rlhf', 'harmless_base_cn', 'harmless_base_en', 'helpful_base_cn', 'helpful_base_en'],
        preprocess_func=HHRLHFCNPreprocessor(columns={'context': 'messages'}, content_key='text'),
        split=['train', 'test'],
        tags=['rlhf', 'dpo', '🔥']))
647
+
648
+
649
def repair_conversations(s: Union[str, Any]) -> Any:
    """Best-effort repair of conversation lists serialized without commas.

    String inputs have the missing commas between adjacent dict literals
    restored and are then parsed with ``ast.literal_eval``; any other input
    is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # NOTE: the last pattern duplicates the first; kept for fidelity.
    for broken in ('}\n {', '}\n{', '}{', '}\n {'):
        s = s.replace(broken, '},{')
    return ast.literal_eval(s)
657
+
658
+
659
# lmsys-chat-1m: real user conversations; strings need comma repair.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/lmsys-chat-1m',
        hf_dataset_id='lmsys/lmsys-chat-1m',
        preprocess_func=MessagesPreprocessor(repair_messages=repair_conversations),
        tags=['chat', 'em']))

# DPO-zh-en-emoji: Chinese answer is chosen, English answer is rejected.
register_dataset(
    DatasetMeta(
        ms_dataset_id='hjh0119/shareAI-Llama3-DPO-zh-en-emoji',
        hf_dataset_id='shareAI/DPO-zh-en-emoji',
        preprocess_func=ResponsePreprocessor(columns={
            'answer_zh': 'response',
            'answer_en': 'rejected_response'
        }),
        tags=['rlhf', 'dpo']))

# Ultrafeedback (KTO-format) preference data, used as-is.
register_dataset(
    DatasetMeta(ms_dataset_id='AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto', tags=['rlhf', 'kto']))

register_dataset(
    DatasetMeta(
        ms_dataset_id='OmniData/Zhihu-KOL-More-Than-100-Upvotes',
        hf_dataset_id='bzb2023/Zhihu-KOL-More-Than-100-Upvotes',
        tags=['zhihu', 'qa']))

register_dataset(
    DatasetMeta(
        ms_dataset_id='OmniData/Zhihu-KOL',
        hf_dataset_id='wangrui6/Zhihu-KOL',
        huge_dataset=True,
        tags=['zhihu', 'qa'],
    ))
692
+
693
+
694
class GuanacoPreprocessor(RowPreprocessor):
    """Parse Guanaco rows whose `instruction` embeds a multi-turn dialogue.

    The instruction text is split on role markers (including common
    misspellings of 'Assistant'); rows with malformed role alternation or
    empty turns are dropped by returning None.
    """

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        instruction = row['instruction']
        input = row['input']
        output = row['output']
        history = []
        if instruction:
            # Role markers include full-width-colon variants and misspellings
            # found in the raw data.
            parts = split_str_parts_by(
                instruction, ['User:', 'User:', 'Assistant:', 'Assistant:', 'Asssistent:', 'Assistent:', 'Assistenz:'])
            for idx, part in enumerate(parts):
                if idx % 2 == 0:
                    # Even parts must be user turns.
                    if 'user' not in part['key'].lower():
                        return
                    history.append([part['content'], None])
                else:
                    # Odd parts must be assistant turns (any spelling).
                    if 'assist' not in part['key'].lower() and 'asssist' not in part['key'].lower():
                        return
                    history[-1][-1] = part['content']
        if input.startswith('User:'):
            input = input[len('User:'):].strip()
        # Drop rows with incomplete turn pairs.
        if any([not h[0] or not h[1] for h in history]):
            return

        messages = []
        for h in history:
            messages.append({'role': 'user', 'content': h[0]})
            messages.append({'role': 'assistant', 'content': h[1]})
        messages.append({'role': 'user', 'content': input})
        messages.append({'role': 'assistant', 'content': output})
        return {
            'messages': messages,
        }
727
+
728
+
729
# GuanacoDataset: multi-turn Chinese chat embedded in alpaca-style fields.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/GuanacoDataset',
        hf_dataset_id='JosephusCheung/GuanacoDataset',
        preprocess_func=GuanacoPreprocessor(),
        tags=['chat', 'zh']))
735
+
736
+
737
class FunctionCallChatmlPreprocessor(MessagesPreprocessor):
    """Lift function descriptions into `tools` and drop the redundant system turn."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        res = super().preprocess(row)
        description = res['function_description']
        if description:
            # One tool definition per blank-line-separated paragraph.
            res['tools'] = description.split('\n\n')
        # The system turn duplicates the tool description; remove it.
        if res['messages'][0]['role'] == 'system':
            res['messages'].pop(0)
        return res
748
+
749
+
750
# function-calling-chatml: English tool-calling SFT data.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/function-calling-chatml',
        hf_dataset_id='Locutusque/function-calling-chatml',
        preprocess_func=FunctionCallChatmlPreprocessor(),
        tags=['agent', 'en', 'sft', '🔥']))
756
+
757
+
758
class Dolly15kPreprocessor(RowPreprocessor):
    """Prefix dolly instructions with their optional context paragraph."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        pieces = []
        if row['context']:
            pieces.append('Here gives some useful information:\n')
            pieces.append(row['context'])
            pieces.append('\n')
        pieces.append(row['instruction'])
        return {
            'messages': [{
                'role': 'user',
                'content': ''.join(pieces)
            }, {
                'role': 'assistant',
                'content': row['response']
            }],
        }
779
+
780
+
781
# databricks-dolly-15k: high-quality human-written multi-task data.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/databricks-dolly-15k',
        hf_dataset_id='databricks/databricks-dolly-15k',
        preprocess_func=Dolly15kPreprocessor(),
        tags=['multi-task', 'en', 'quality']))
787
+
788
+
789
class OrpoDPOMix40kPreprocessor(MessagesPreprocessor):
    """Drop toxic-dpo samples from the orpo-dpo-mix-40k blend."""

    def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        if row['source'] != 'toxic-dpo-v0.2':
            return super().preprocess(row)
        return None
795
+
796
+
797
# orpo-dpo-mix-40k: chosen/rejected message-list preference data.
register_dataset(
    DatasetMeta(
        ms_dataset_id='AI-ModelScope/orpo-dpo-mix-40k',
        hf_dataset_id='mlabonne/orpo-dpo-mix-40k',
        preprocess_func=OrpoDPOMix40kPreprocessor(columns={
            'chosen': 'messages',
            'rejected': 'rejected_messages'
        }),
        tags=['dpo', 'orpo', 'en', 'quality']))

# sharegpt: general multi-round chat in three language subsets.
register_dataset(
    DatasetMeta(
        ms_dataset_id='swift/sharegpt',
        subsets=['common-zh', 'unknow-zh', 'common-en'],
        tags=['chat', 'general', 'multi-round']))
812
+
813
+
814
class SelfCognitionPreprocessor(ResponsePreprocessor):
    """Substitute {{NAME}}/{{AUTHOR}} placeholders in self-cognition samples.

    ``name`` and ``author`` are (zh, en) pairs set externally before training;
    the value matching the row's language tag is substituted.
    """

    # (zh, en) substitution pairs; None disables the substitution.
    name: Optional[Tuple[str, str]] = None
    author: Optional[Tuple[str, str]] = None

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        for key in ('name', 'author'):
            pair = getattr(self, key)
            if pair is None:
                continue
            value = pair[0] if row['tag'] == 'zh' else pair[1]
            if value is None:
                continue
            token = '{{' + key.upper() + '}}'
            row['query'] = row['query'].replace(token, value)
            row['response'] = row['response'].replace(token, value)
        return super().preprocess(row)
830
+
831
+
832
class Qwen3SelfCognitionPreprocessor(SelfCognitionPreprocessor):
    """Self-cognition variant for Qwen3: disable thinking via '/no_think'."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        # Append the no-think switch and prepend an empty think block.
        row['query'] = row['query'] + ' /no_think'
        row['response'] = '<think>\n\n</think>\n\n' + row['response']
        return super().preprocess(row)
838
+
839
+
840
class EmptyThinkSelfCognitionPreprocessor(SelfCognitionPreprocessor):
    """Self-cognition variant that prepends an empty think block only."""

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        row['response'] = '<think>\n\n</think>\n\n' + row['response']
        return super().preprocess(row)
845
+
846
+
847
# self-cognition: identity data with plain, Qwen3 no-think and empty-think variants.
register_dataset(
    DatasetMeta(
        ms_dataset_id='swift/self-cognition',
        hf_dataset_id='modelscope/self-cognition',
        subsets=[
            SubsetDataset(preprocess_func=SelfCognitionPreprocessor()),
            SubsetDataset('qwen3', preprocess_func=Qwen3SelfCognitionPreprocessor()),
            SubsetDataset('empty_think', preprocess_func=EmptyThinkSelfCognitionPreprocessor()),
        ],
        tags=['chat', 'self-cognition', '🔥']))
ms-swift/swift/llm/dataset/preprocessor/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (513 Bytes). View file
 
ms-swift/swift/llm/dataset/preprocessor/__pycache__/core.cpython-310.pyc ADDED
Binary file (18 kB). View file
 
ms-swift/swift/llm/dataset/register.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ from copy import deepcopy
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
+
7
+ import json
8
+
9
+ from swift.utils import get_logger, use_hf_hub
10
+ from .preprocessor import DATASET_TYPE, AutoPreprocessor, MessagesPreprocessor
11
+
12
+ PreprocessFunc = Callable[..., DATASET_TYPE]
13
+ LoadFunction = Callable[..., DATASET_TYPE]
14
+ logger = get_logger()
15
+
16
+
17
@dataclass
class SubsetDataset:
    """One named subset of a registered dataset."""
    # `name` is used for matching subsets of the dataset, and `subset` refers to the subset_name on the hub.
    name: Optional[str] = None
    # Hub subset name. If `name` is None, __post_init__ sets `name` to this value.
    subset: str = 'default'

    # Higher priority. If set to None, the attributes of the DatasetMeta will be used.
    split: Optional[List[str]] = None
    preprocess_func: Optional[PreprocessFunc] = None

    # If the dataset specifies "all," weak subsets will be skipped.
    is_weak_subset: bool = False

    def __post_init__(self) -> None:
        # Default the matching name to the hub subset name.
        if self.name is None:
            self.name = self.subset

    def set_default(self, dataset_meta: 'DatasetMeta') -> 'SubsetDataset':
        """Return a copy with unset split/preprocess_func filled from `dataset_meta`."""
        subset_dataset = deepcopy(self)
        for k in ['split', 'preprocess_func']:
            v = getattr(subset_dataset, k)
            if v is None:
                setattr(subset_dataset, k, deepcopy(getattr(dataset_meta, k)))
        return subset_dataset
42
+
43
+
44
@dataclass
class DatasetMeta:
    """Registration metadata describing where a dataset lives and how to load it."""
    # Hub ids / local path; at least one is expected to be set.
    ms_dataset_id: Optional[str] = None
    hf_dataset_id: Optional[str] = None
    dataset_path: Optional[str] = None
    # Optional explicit registry key; overrides the id tuple when set.
    dataset_name: Optional[str] = None
    ms_revision: Optional[str] = None
    hf_revision: Optional[str] = None

    subsets: List[Union[SubsetDataset, str]] = field(default_factory=lambda: ['default'])
    # Applicable to all subsets.
    split: List[str] = field(default_factory=lambda: ['train'])
    # First perform column mapping, then proceed with the preprocess_func.
    preprocess_func: PreprocessFunc = field(default_factory=lambda: AutoPreprocessor())
    load_function: Optional[LoadFunction] = None

    tags: List[str] = field(default_factory=list)
    help: Optional[str] = None
    huge_dataset: bool = False

    def __post_init__(self) -> None:
        # Imported lazily to avoid a circular import with loader.py.
        from .loader import DatasetLoader
        if self.load_function is None:
            self.load_function = DatasetLoader.load
        # Normalize plain subset-name strings into SubsetDataset objects.
        for i, subset in enumerate(self.subsets):
            if isinstance(subset, str):
                self.subsets[i] = SubsetDataset(subset=subset)
71
+
72
+
73
+ DATASET_MAPPING: Dict[Tuple[str, str, str], DatasetMeta] = {}
74
+
75
+
76
def get_dataset_list() -> List[str]:
    """Return the registered dataset ids for the active hub.

    Uses the HF id (index 1 of the key tuple) when `use_hf_hub()` is true,
    otherwise the MS id (index 0).

    Fix: keys registered via `dataset_meta.dataset_name` are plain strings;
    indexing them with `key[0]`/`key[1]` previously yielded single characters.
    String keys are now returned as-is. `use_hf_hub()` is also hoisted out of
    the loop.
    """
    idx = 1 if use_hf_hub() else 0
    datasets = []
    for key in DATASET_MAPPING:
        if isinstance(key, str):
            # Explicit dataset_name registration — hub-agnostic.
            datasets.append(key)
        elif key[idx]:
            datasets.append(key[idx])
    return datasets
86
+
87
+
88
def register_dataset(dataset_meta: DatasetMeta, *, exist_ok: bool = False) -> None:
    """Register dataset

    Args:
        dataset_meta: The `DatasetMeta` info of the dataset.
        exist_ok: If the dataset id exists, raise error or update it.
    """
    # Key is the explicit name when given, otherwise the id/path tuple.
    if dataset_meta.dataset_name:
        key = dataset_meta.dataset_name
    else:
        key = (dataset_meta.ms_dataset_id, dataset_meta.hf_dataset_id, dataset_meta.dataset_path)
    if key in DATASET_MAPPING and not exist_ok:
        raise ValueError(f'The `{key}` has already been registered in the DATASET_MAPPING.')

    DATASET_MAPPING[key] = dataset_meta
103
+
104
+
105
def _preprocess_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str] = None) -> Dict[str, Any]:
    """Normalize one raw dataset-info dict into `DatasetMeta` keyword arguments.

    Resolves the preprocessor, makes `dataset_path` absolute (relative paths
    are anchored at `base_dir`), and recursively normalizes nested subsets.
    """
    d_info = deepcopy(d_info)

    columns = d_info.pop('columns', None)

    # 'messages' config selects a MessagesPreprocessor; otherwise auto-detect.
    if 'messages' in d_info:
        d_info['preprocess_func'] = MessagesPreprocessor(**d_info.pop('messages'), columns=columns)
    else:
        d_info['preprocess_func'] = AutoPreprocessor(columns=columns)

    if 'dataset_path' in d_info:
        path = d_info.pop('dataset_path')
        if base_dir is not None and not os.path.isabs(path):
            path = os.path.join(base_dir, path)
        d_info['dataset_path'] = os.path.abspath(os.path.expanduser(path))

    if 'subsets' in d_info:
        subsets = d_info.pop('subsets')
        d_info['subsets'] = [
            SubsetDataset(**_preprocess_d_info(sub)) if isinstance(sub, dict) else sub for sub in subsets
        ]
    return d_info
132
+
133
+
134
def _register_d_info(d_info: Dict[str, Any], *, base_dir: Optional[str] = None) -> DatasetMeta:
    """Register a single dataset to dataset mapping

    Args:
        d_info: The dataset info
    """
    meta = DatasetMeta(**_preprocess_d_info(d_info, base_dir=base_dir))
    register_dataset(meta)
    return meta
144
+
145
+
146
def register_dataset_info(dataset_info: Union[str, List[str], None] = None) -> List[DatasetMeta]:
    """Register dataset from the `dataset_info.json` or a custom dataset info file
    This is used to deal with the datasets defined in the json info file.

    Args:
        dataset_info: The dataset info path
    """
    # dataset_info_path: path, json or None
    if dataset_info is None:
        # Default to the bundled dataset_info.json next to this module.
        dataset_info = os.path.join(os.path.dirname(__file__), 'data', 'dataset_info.json')
    assert isinstance(dataset_info, (str, list))
    base_dir = None
    log_msg = None
    if isinstance(dataset_info, str):
        dataset_path = os.path.abspath(os.path.expanduser(dataset_info))
        if os.path.isfile(dataset_path):
            # A file path: relative dataset_path entries resolve against it.
            log_msg = dataset_path
            base_dir = os.path.dirname(dataset_path)
            with open(dataset_path, 'r', encoding='utf-8') as f:
                dataset_info = json.load(f)
        else:
            dataset_info = json.loads(dataset_info)  # json
    if len(dataset_info) == 0:
        return []
    res = []
    for d_info in dataset_info:
        res.append(_register_d_info(d_info, base_dir=base_dir))

    if log_msg is None:
        # NOTE(review): `.keys()` assumes a dict-shaped payload here; a
        # list-shaped payload with >=5 entries would raise — confirm intended
        # input shape for inline JSON.
        log_msg = dataset_info if len(dataset_info) < 5 else list(dataset_info.keys())
    logger.info(f'Successfully registered `{log_msg}`.')
    return res
ms-swift/swift/llm/ds_config/zero0.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+
11
+ "bf16": {
12
+ "enabled": "auto"
13
+ },
14
+
15
+ "zero_optimization": {
16
+ "stage": 0,
17
+ "allgather_partitions": true,
18
+ "allgather_bucket_size": 2e8,
19
+ "overlap_comm": false,
20
+ "reduce_scatter": true,
21
+ "reduce_bucket_size": 2e8,
22
+ "contiguous_gradients": true
23
+ },
24
+
25
+ "gradient_accumulation_steps": "auto",
26
+ "gradient_clipping": "auto",
27
+ "steps_per_print": 2000,
28
+ "train_batch_size": "auto",
29
+ "train_micro_batch_size_per_gpu": "auto",
30
+ "wall_clock_breakdown": false
31
+ }