Add files using upload-large-folder tool
- ms-swift/docs/source_en/BestPractices/More-Best-Practices.md +7 -0
- ms-swift/docs/source_en/Customization/Custom-model.md +35 -0
- ms-swift/docs/source_en/Customization/Pluginization.md +234 -0
- ms-swift/docs/source_en/GetStarted/SWIFT-installation.md +88 -0
- ms-swift/docs/source_en/Instruction/Agent-support.md +215 -0
- ms-swift/docs/source_en/Instruction/Command-line-parameters.md +675 -0
- ms-swift/docs/source_en/Instruction/Evaluation.md +270 -0
- ms-swift/docs/source_en/Instruction/Frequently-asked-questions.md +716 -0
- ms-swift/docs/source_en/Instruction/GRPO.md +471 -0
- ms-swift/docs/source_en/Instruction/Inference-and-deployment.md +354 -0
- ms-swift/docs/source_en/Instruction/Megatron-SWIFT-Training.md +305 -0
- ms-swift/docs/source_en/Instruction/RLHF.md +114 -0
- ms-swift/docs/source_en/Instruction/Reinforced-Fine-tuning.md +103 -0
- ms-swift/docs/source_en/Instruction/Sample.md +100 -0
- ms-swift/docs/source_en/Instruction/Supported-models-and-datasets.md +0 -0
- ms-swift/docs/source_en/_templates/autosummary/class.rst +10 -0
- ms-swift/docs/source_en/_templates/sobolengine.rst +14 -0
- ms-swift/examples/README.md +13 -0
- ms-swift/examples/app/base_url/demo.py +13 -0
- ms-swift/examples/app/base_url/demo.sh +7 -0
- ms-swift/examples/custom/dataset.py +30 -0
- ms-swift/examples/custom/infer.sh +9 -0
- ms-swift/examples/custom/model.py +33 -0
- ms-swift/examples/deploy/agent/client.py +87 -0
- ms-swift/examples/deploy/agent/server.sh +8 -0
- ms-swift/examples/deploy/bert/client.py +29 -0
- ms-swift/examples/deploy/client/llm/chat/openai_client.py +46 -0
- ms-swift/examples/deploy/lora/server.sh +7 -0
- ms-swift/examples/deploy/reward_model/client.py +17 -0
- ms-swift/examples/deploy/reward_model/server.sh +5 -0
- ms-swift/examples/deploy/server/demo.sh +14 -0
- ms-swift/examples/eval/eval_url/demo.py +14 -0
- ms-swift/examples/eval/eval_url/eval.sh +7 -0
- ms-swift/examples/eval/train_eval/train.sh +24 -0
- ms-swift/examples/eval/vlm/eval.sh +8 -0
- ms-swift/examples/export/ollama.sh +4 -0
- ms-swift/examples/export/push_to_hub.sh +6 -0
- ms-swift/examples/export/quantize/awq.sh +12 -0
- ms-swift/examples/export/quantize/bert/bnb.sh +16 -0
- ms-swift/examples/export/quantize/bert/gptq.sh +18 -0
- ms-swift/examples/export/quantize/bnb.sh +8 -0
- ms-swift/examples/export/quantize/gptq.sh +13 -0
- ms-swift/examples/export/quantize/mllm/awq.sh +19 -0
- ms-swift/examples/export/quantize/moe/awq.sh +14 -0
- ms-swift/examples/infer/demo_reward_model.py +31 -0
- ms-swift/examples/infer/lmdeploy/ddp.sh +7 -0
- ms-swift/examples/infer/lmdeploy/mllm_tp.sh +8 -0
- ms-swift/examples/infer/pt/lora.sh +10 -0
- ms-swift/examples/infer/pt/mllm_device_map.sh +9 -0
- ms-swift/examples/infer/vllm/mllm_tp.sh +11 -0
ms-swift/docs/source_en/BestPractices/More-Best-Practices.md
ADDED
# More Best Practices

- [Qwen2.5 self-cognition SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-self-cognition)
- [Qwen2-VL Latex-OCR SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2vl-ocr)
- [Qwen2.5-VL Grounding Task SFT](https://github.com/modelscope/ms-swift/tree/main/examples/notebook/qwen2_5-vl-grounding)
- [Qwen3 Full-Pipeline Best Practices](https://github.com/modelscope/ms-swift/issues/4030)
ms-swift/docs/source_en/Customization/Custom-model.md
ADDED
# Custom Model

The models built into ms-swift can be used directly by specifying either `model_id` or `model_path`: `--model <model_id_or_path>`. ms-swift determines the `model_type` from the suffix of `model_id`/`model_path` and the `config.json` file. Each `model_type` has a unique model structure, template, and loading method. Of course, you can also manually override these by passing `--model_type` and `--template`. The supported `model_type` values and templates are listed in [Supported Models and Datasets](../Instruction/Supported-models-and-datasets.md).

## Model Registration

Custom models are typically implemented via model registration. You can refer to a [built-in model](https://github.com/modelscope/ms-swift/blob/main/swift/llm/model/model/qwen.py), a [built-in dialogue template](https://github.com/modelscope/ms-swift/blob/main/swift/llm/template/template/qwen.py), or the example code in [examples](https://github.com/modelscope/swift/blob/main/examples/custom). You can specify `--custom_register_path xxx.py` to load the externally registered content, which is convenient for users who install via pip rather than git clone.

The `register_model` function registers a model in `MODEL_MAPPING`. You complete model registration by calling `register_model(model_meta)`, where `model_meta` stores the model's metadata. The parameter list for ModelMeta is as follows:

- model_type: Required. The model type, which also serves as the unique ID.
- model_groups: Required. Lists the ModelScope/HuggingFace model IDs and local paths. Running [run_model_info.py](https://github.com/modelscope/ms-swift/blob/main/scripts/utils/run_model_info.py) will automatically generate the [supported models documentation](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) and automatically match the model_type based on the `--model` suffix.
- template: Required. The default template type used when `--template` is not specified.
- get_function: Required. The loading function for the model and tokenizer/processor (for multimodal models). For LLMs this is typically set to `get_model_tokenizer_with_flash_attn`.
- model_arch: The model architecture. Defaults to None. Multimodal model training requires this parameter to determine the prefixes for the llm/vit/aligner parts.
- architectures: The architectures entry in config.json, used to automatically match a model to its model_type. Defaults to `[]`.
- additional_saved_files: Files that need to be additionally saved during full-parameter training and merge-lora. Defaults to `[]`.
- torch_dtype: The default dtype used when `torch_dtype` is not passed during model loading. Defaults to None, in which case it is read from config.json.
- is_multimodal: Whether the model is multimodal. Defaults to False.
- ignore_patterns: File patterns to ignore when downloading from the hub. Defaults to `[]`.

The `register_template` function registers a dialogue template in `TEMPLATE_MAPPING`. To register a dialogue template, simply call `register_template(template_meta)`, where `template_meta` stores the template's metadata. The parameter list for TemplateMeta is as follows:

- template_type: Required. The type of dialogue template, which also serves as the unique ID.
- prefix: Required. The prefix of the dialogue template, usually covering parts such as the system and bos_token; it is generated once, independent of the multi-turn dialogue loop. For example, the prefix for qwen is `[]`.
- prompt: Required. The dialogue portion before `{{RESPONSE}}`. We use `{{QUERY}}` as a placeholder for the user's query. For example, the prompt for qwen is `['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n']`.
- chat_sep: Required. The separator between turns in multi-turn dialogues. If set to None, the template does not support multi-turn dialogue. For example, the chat_sep for qwen is `['<|im_end|>\n']`.
- suffix: Defaults to `[['eos_token_id']]`. The suffix of the dialogue template, generated once independently of the multi-turn dialogue loop, usually the eos_token. For example, the suffix for qwen is `['<|im_end|>']`.
- template_cls: Defaults to `Template`. Customization is generally required when defining templates for multimodal models, particularly of the `_encode`, `_post_encode`, and `_data_collator` functions.
- system_prefix: Defaults to None. The prefix for dialogue templates that include a system. We use `{{SYSTEM}}` as a placeholder for the system. For example, the system_prefix for qwen is `['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']`.
  - Note: If the system may be empty and `prefix` can stand in for `system_prefix`, you can write `prefix` as a prefix that includes the system, without setting `system_prefix`.
  - If the prefix does not include `{{SYSTEM}}` and system_prefix is not set, the template does not support a system.
- default_system: Defaults to None. The default system used when `--system` is not provided. For example, the default_system for qwen is `'You are a helpful assistant.'`.
- stop_words: Defaults to `[]`. Additional stop words besides eos_token and `suffix[-1]`. For example, the stop_words for qwen is `['<|endoftext|>']`.
  - Note: During inference, the output response is filtered for eos_token and `suffix[-1]`, but additional stop_words are retained.
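
Putting the pieces together, the registration flow can be sketched with simplified stand-ins (plain Python; `MODEL_MAPPING`, `ModelMeta`, and `register_model` below only mirror the real names in `swift.llm` and are not the actual API, and the suffix-matching logic is a rough approximation of the behavior described above):

```python
from dataclasses import dataclass, field
from typing import Callable, List, Optional

# Simplified stand-in for swift's MODEL_MAPPING registry (illustrative only).
MODEL_MAPPING = {}

@dataclass
class ModelMeta:
    model_type: str               # unique ID
    model_groups: List[str]       # ModelScope/HF model IDs or local paths
    template: str                 # default template type
    get_function: Callable        # model/tokenizer loading function
    architectures: List[str] = field(default_factory=list)
    is_multimodal: bool = False

def register_model(model_meta: ModelMeta):
    if model_meta.model_type in MODEL_MAPPING:
        raise ValueError(f'{model_meta.model_type} is already registered')
    MODEL_MAPPING[model_meta.model_type] = model_meta

def match_model_type(model_path: str) -> Optional[str]:
    # Match --model by its path suffix, as described above.
    suffix = model_path.rsplit('/', 1)[-1].lower()
    for model_type, meta in MODEL_MAPPING.items():
        for group in meta.model_groups:
            if group.rsplit('/', 1)[-1].lower() == suffix:
                return model_type
    return None

register_model(ModelMeta(
    model_type='my_custom_llm',
    model_groups=['MyOrg/MyCustomLLM-7B'],   # hypothetical model ID
    template='my_template',
    get_function=lambda *a, **kw: None,      # placeholder loader
    architectures=['MyCustomForCausalLM'],
))
print(match_model_type('/ckpt/MyCustomLLM-7B'))  # -> my_custom_llm
```

In the real plugin file, `get_function` would return the loaded model and tokenizer, and the file would be passed via `--custom_register_path`.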
ms-swift/docs/source_en/Customization/Pluginization.md
ADDED
# Pluginization

Pluginization is a significant new feature introduced in SWIFT 3.0. Through a plugin-based approach, we aim to make customizing the training process feel more natural to developers.

## Callback Mechanism

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/callback.py).

The `callback` mechanism is a customization feature of the Transformers Trainer that allows developers to control the training process. Typically, a custom callback looks like the following:

```python
class CustomCallback(TrainerCallback):

    def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Do something when training begins.
        pass

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Do something when saving a checkpoint.
        pass
```

Callbacks are registered with the trainer before it is instantiated. The example provided demonstrates a simple version of an EarlyStopping mechanism. Registering your own callback is straightforward:

```python
extra_callbacks = [CustomCallback()]
```

Developers can add new callbacks in `plugin/callback.py` to customize their training process. For the detailed parameters of callbacks, refer to [this documentation](https://huggingface.co/docs/transformers/main_classes/callback).
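
As a concrete illustration of the early-stopping idea mentioned above, a minimal version of the logic might look like the sketch below. This is a plain-Python stand-in: a real implementation would subclass `transformers.TrainerCallback`, read the metric from the trainer state, and set `control.should_training_stop`.

```python
class EarlyStopSketch:
    """Stop when eval loss hasn't improved for `patience` evaluations (sketch only)."""

    def __init__(self, patience: int = 3):
        self.patience = patience
        self.best_loss = float('inf')
        self.bad_evals = 0
        self.should_stop = False

    def on_evaluate(self, eval_loss: float):
        if eval_loss < self.best_loss:
            self.best_loss = eval_loss
            self.bad_evals = 0
        else:
            self.bad_evals += 1
            if self.bad_evals >= self.patience:
                # In a real TrainerCallback this would set control.should_training_stop.
                self.should_stop = True

cb = EarlyStopSketch(patience=2)
for loss in [1.0, 0.8, 0.9, 0.85]:  # eval losses over training
    cb.on_evaluate(loss)
print(cb.should_stop)  # -> True (two evaluations without improvement)
```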

## Customizing Loss

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/loss.py).

SWIFT supports customizing the loss function through plugins. If this feature is not used, the default cross-entropy loss (CE loss) is applied. Developers can register custom loss functions in this file, and the trainer will automatically use the customized loss method.

For example, add the following code in `plugin/loss.py`:

```python
@register_loss_func("custom_loss")
def loss_scale_func(outputs, labels, loss_scale=None, num_items_in_batch=None) -> torch.Tensor:
    # Write your own loss calculation here
    return loss
```

It is important to note that the loss function is strongly tied to the training task. Currently, loss customization supports the PT and SFT tasks. For human-alignment tasks (e.g., DPO, PPO) or classification tasks (seq_cls), loss customization through plugins is not supported.
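
The registration pattern can be sketched in plain Python. `LOSS_MAPPING` and `register_loss_func` below are simplified stand-ins for swift's registry, and the toy loss operates on plain probability lists rather than the `torch.Tensor` logits the real function receives:

```python
import math

# Stand-in for swift's loss registry (illustrative only).
LOSS_MAPPING = {}

def register_loss_func(loss_type):
    # Decorator that records the loss function under its name.
    def wrapper(fn):
        LOSS_MAPPING[loss_type] = fn
        return fn
    return wrapper

@register_loss_func('custom_loss')
def weighted_ce(probs, labels, token_weights=None):
    # Toy cross-entropy over per-token probabilities, with optional per-token weights.
    if token_weights is None:
        token_weights = [1.0] * len(labels)
    losses = [-w * math.log(p[y]) for p, y, w in zip(probs, labels, token_weights)]
    return sum(losses) / sum(token_weights)

# The trainer would look the function up by the configured name:
loss_fn = LOSS_MAPPING['custom_loss']
probs = [[0.9, 0.1], [0.2, 0.8]]   # model probabilities for two tokens
print(round(loss_fn(probs, [0, 1]), 4))
```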

## Customizing Loss Scale

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/loss_scale/loss_scale.py).

The `loss_scale` mechanism is one of SWIFT's crucial features. In PT and SFT tasks, the loss over trainable tokens is uniform: every token participates equally in backpropagation. In certain situations, however, some tokens deserve higher weights and extra attention. In such cases, `loss_scale` allows developers to define custom token weights.

```python
class LastRoundLossScale(LossScale):

    def get_loss_scale(self, context: str, context_type: ContextType, is_last_round: bool, **kwargs):
        if context_type == ContextType.RESPONSE:
            return [context], [float(is_last_round)]
        return super().get_loss_scale(context, context_type, is_last_round)
```

In the above code, a `Tuple` is returned: the first element is the `context` (or its split parts), and the second is the corresponding `loss_scale`, where each float value represents a weight. For example, consider the following weight settings:

```text
["学习", "好", "数学", "是", "重要", "的"]
[1.0, 0.5, 2.0, 0.5, 2.0, 0.1]
```

Here, we place more emphasis on the words "数学" (mathematics) and "重要" (important) by increasing their weights to 2.0.

Returning to the code: we check whether the given `context` is a response. If it is a response and belongs to the last round of a multi-turn dialogue, we return a `loss_scale` of `[1]`; otherwise we fall back to the base implementation (which sets `loss_scale` to `[0]`). This ensures that only the responses of the last round participate in training, while other responses do not. With this mechanism, we can also make all tokens (prompts and responses) participate in training, or focus training on an agent's special tokens, and so on.

In PT and SFT, `loss_scale` is fully supported (both whether a token participates in training and the size of its weight). In human-alignment tasks, however, only whether certain tokens participate in training is supported, not the size of the weights.
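
The effect of per-token weights can be shown with a small worked example (pure Python; in SWIFT the weights produced by `loss_scale` scale the per-token CE loss before reduction):

```python
import math

def scaled_ce(probs, labels, loss_scale):
    # Per-token cross-entropy multiplied by its loss_scale weight;
    # tokens with weight 0 drop out of the loss entirely.
    per_token = [-math.log(p[y]) for p, y in zip(probs, labels)]
    return sum(w * l for w, l in zip(loss_scale, per_token))

probs = [[0.7, 0.3], [0.4, 0.6], [0.9, 0.1]]  # model probabilities for 3 tokens
labels = [0, 1, 0]

uniform = scaled_ce(probs, labels, [1.0, 1.0, 1.0])    # every token counts equally
last_only = scaled_ce(probs, labels, [0.0, 0.0, 1.0])  # only the last token trains
print(round(uniform, 4), round(last_only, 4))
```

Setting the weight of a span to `0.0` reproduces the "last round only" behavior of `LastRoundLossScale`; raising a weight above `1.0` reproduces the emphasis in the text example above.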

## Customizing Metrics

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/metric.py).

Metrics can be customized to evaluate the training process:

```python
METRIC_MAPPING = {
    'acc': (compute_acc_metrics, preprocess_logits_for_acc),
    'nlg': (compute_nlg_metrics, None),
    'custom': (custom_metric, custom_preprocess),
}

def get_metric(metric: str):
    return METRIC_MAPPING[metric]
```

In the above definition, we added a new `custom` metric. Its value consists of two parts: the first is the metric computation function, which returns a dictionary of metric key-value pairs; the second is the preprocessing step for the logits, which returns the actual predictions.
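
A minimal sketch of such a `(compute, preprocess)` pair is shown below. It uses plain Python lists; the real functions receive logit/label tensors from the trainer, and the function names match the mapping above only for illustration:

```python
def custom_preprocess(logits):
    # Turn per-token logits into predicted class indices (argmax).
    return [max(range(len(row)), key=row.__getitem__) for row in logits]

def custom_metric(predictions, labels):
    # Return a dict of metric key-value pairs, as the trainer expects.
    correct = sum(p == l for p, l in zip(predictions, labels))
    return {'custom_acc': correct / len(labels)}

logits = [[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]
preds = custom_preprocess(logits)
print(custom_metric(preds, [1, 0, 0]))  # 2 of 3 predictions are correct
```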

## Customizing Optimizers

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/optimizer.py).

- Apply different learning rates to different parts of the model. For example, use separate learning rates for the ViT and the LLM, as shown [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/lora_llm_full_vit/custom_plugin.py).

Users can add their own optimizers and learning rate schedulers here:

```python
def create_custom_optimizers(args, model, dataset):
    # Create your own optimizer
    return CustomOptimizer(optimizer_grouped_parameters, **optimizer_kwargs), CustomScheduler(...)

optimizers_map = {
    'custom': create_custom_optimizers,
    ...
}
```

When developers need to use other optimizers, such as those defined in new research papers, they can define their creation process here and specify the parameter:

```shell
--optimizer custom
```

This will invoke the custom optimizer.
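
For the ViT-vs-LLM learning-rate split mentioned above, the parameter-grouping step can be sketched as follows. This is plain Python over parameter names; the real plugin builds `optimizer_grouped_parameters` for a torch optimizer, and the `'visual.'` prefix is an assumed naming convention, not a fixed swift constant:

```python
def group_parameters(named_params, vit_prefix='visual.', vit_lr=2e-5, llm_lr=1e-5):
    # Split parameters into two groups with different learning rates,
    # keyed by a name prefix.
    vit_group = {'params': [], 'lr': vit_lr}
    llm_group = {'params': [], 'lr': llm_lr}
    for name in named_params:
        (vit_group if name.startswith(vit_prefix) else llm_group)['params'].append(name)
    return [vit_group, llm_group]

names = ['visual.blocks.0.attn.qkv', 'model.layers.0.mlp.gate_proj', 'lm_head']
groups = group_parameters(names)
print([g['lr'] for g in groups], [len(g['params']) for g in groups])
```

A real `create_custom_optimizers` would pass these groups to e.g. `torch.optim.AdamW` and return the optimizer together with a scheduler.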

## Customizing Agent Template

The example is [here](https://github.com/modelscope/swift/blob/main/swift/plugin/agent_template).

## Customizing Tuners

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/tuner.py).

- For multimodal models, apply full-parameter training to the ViT part while using LoRA for the LLM part. Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/lora_llm_full_vit).
- For Phi4-multimodal, train its existing LoRA directly without adding an extra LoRA. Refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/plugins/tuner_phi4_mm.sh).

Tuner customization is another unique feature of SWIFT. Developers can bypass the complex tuner initialization process and the cost of code integration by registering new tuners here:

```python
class IA3(Tuner):

    @staticmethod
    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
        model_arch: ModelKeys = MODEL_ARCH_MAPPING[model.model_meta.model_arch]
        ia3_config = IA3Config(
            target_modules=find_all_linears(model), feedforward_modules='.*' + model_arch.mlp.split('{}.')[1] + '.*')
        return get_peft_model(model, ia3_config)

    @staticmethod
    def save_pretrained(
            model: torch.nn.Module,
            save_directory: str,
            state_dict: Optional[dict] = None,
            safe_serialization: bool = True,
            **kwargs,
    ) -> None:
        model: PeftModel
        model.save_pretrained(save_directory, state_dict=state_dict, safe_serialization=safe_serialization, **kwargs)

    @staticmethod
    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
        return PeftModel.from_pretrained(model, model_id, **kwargs)
```

In the above example, we apply PEFT's IA3 to model training. This class includes three methods:

- `prepare_model`: how to wrap the original model with the tuner and set up the trainable parameters.
- `save_pretrained`: how to save the model during training.
- `from_pretrained`: how to reload previously saved checkpoints for subsequent training and inference.

These three methods are invoked during the SWIFT training process, allowing developers to use their tuners without reading the complex training code.

## PRM (Process Reward Model)

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/prm.py).

PRM stands for Process Reward Model, which is used in the `swift sample` command. A PRM only needs to support a simple interface:

```python
class PRM:

    def __init__(self):
        # init here
        pass

    def __call__(self, infer_requests: List[InferRequest], **kwargs) -> List[Union[float, List[float]]]:
        raise NotImplementedError
```

`InferRequest` comes from `swift.llm`, and the returned `List[Union[float, List[float]]]` may contain a single reward or several rewards per request. Developers can access the queries and responses in `infer_requests` and split them by their own method, for example:

```text
Let's think step by step.

Step1: xxx

Step2: xxx

So, the answer is ...
```

Developers can split the process here, batch the steps into the PRM for inference, and return the rewards. More generally, developers can call a remote URL here, such as a closed-source PRM large model, and return its rewards.
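
The step-splitting described above can be sketched with a regular expression. The `StepN:` markers follow the example format shown above and are an assumed convention, not a fixed swift format:

```python
import re

def split_steps(response: str):
    # Split a chain-of-thought response into scorable chunks on 'StepN:' markers;
    # each chunk would then be batched into the PRM for a per-step reward.
    parts = re.split(r'\n+(?=Step\d+:)', response.strip())
    return [p.strip() for p in parts if p.strip()]

response = ("Let's think step by step.\n\n"
            "Step1: compute 2+2=4\n\n"
            "Step2: double it to 8\n\n"
            "So, the answer is 8")
steps = split_steps(response)
print(len(steps))  # -> 3 (preamble, Step1, Step2 + conclusion)
```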

## ORM (Outcome Reward Model)

An example can be found [here](https://github.com/modelscope/swift/blob/main/swift/plugin/orm.py).

ORM stands for Outcome Reward Model. An ORM typically uses regular expressions to determine whether a response is correct. For example:

```python
class MathORM(ORM):

    @staticmethod
    def extract_boxed_result(text):
        pattern = r'\\boxed{([^}]*)}'
        match = re.search(pattern, text)
        if match:
            return match.group(1).strip()
        else:
            return None

    def __call__(self, infer_requests: List[InferRequest], ground_truths: List[str],
                 **kwargs) -> List[float]:
        rewards = []
        predictions = [request.messages[-1]['content'] for request in infer_requests]
        for prediction, ground_truth in zip(predictions, ground_truths):
            res1 = MathORM.extract_boxed_result(prediction) or ''
            res2 = MathORM.extract_boxed_result(ground_truth) or ''
            rewards.append(float(res1.strip() == res2.strip()))

        return rewards


orms = {
    'math': MathORM,
}
```

In the above code, we define a process to parse mathematical responses. If the results match, it returns a score of `1.0`; otherwise, `0.0`. Unlike a PRM, this class's `__call__` method takes an additional parameter `ground_truths`, which holds the actual labels (the standard responses defined in the dataset) corresponding to the `infer_requests`.
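
The boxed-answer matching above can be exercised standalone, re-using only the regex with plain strings in place of `InferRequest` objects:

```python
import re

def extract_boxed_result(text):
    # Same regex as MathORM above: pull the content of the first \boxed{...}.
    match = re.search(r'\\boxed{([^}]*)}', text)
    return match.group(1).strip() if match else None

prediction = r'Therefore the answer is \boxed{ 42 }.'
ground_truth = r'\boxed{42}'
reward = float((extract_boxed_result(prediction) or '')
               == (extract_boxed_result(ground_truth) or ''))
print(reward)  # -> 1.0 (whitespace inside the box is stripped before comparing)
```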
ms-swift/docs/source_en/GetStarted/SWIFT-installation.md
ADDED
# SWIFT Installation

## Wheel Packages Installation

You can install it using pip:

```shell
pip install 'ms-swift'
# For evaluation usage
pip install 'ms-swift[eval]' -U
# Full capabilities
pip install 'ms-swift[all]' -U
```

## Source Code Installation

```shell
# pip install git+https://github.com/modelscope/ms-swift.git

# Full capabilities
# pip install "git+https://github.com/modelscope/ms-swift.git#egg=ms-swift[all]"

git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e .

# Full capabilities
# pip install -e '.[all]'
```

## Older Versions

SWIFT underwent an incompatible restructuring starting from version 3.0. If you need to use the old 2.x versions, run the following command to install:

```shell
pip install ms-swift==2.*
```

## Mirror

```
# vllm0.8.3 (this version of vllm may cause some GRPO training runs to get stuck; for GRPO training it is recommended to prefer vllm0.7.3).
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1

# vllm0.7.3
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.5.1-modelscope1.25.0-swift3.2.2
```

More images can be found [here](https://modelscope.cn/docs/intro/environment-setup#%E6%9C%80%E6%96%B0%E9%95%9C%E5%83%8F).

## Supported Hardware

| Hardware Environment | Remarks |
| -------------------- | -------------------------------------------------------- |
| A10/A100/H100 | |
| RTX 20/30/40 Series | |
| T4/V100 | Some models may produce NaN |
| Ascend NPU | Some models may produce NaN or hit unsupported operators |
| MPS | |
| CPU | |

## Running Environment

| | Range | Recommended | Notes |
| ------------ | ------------ | ----------- | ----------------------------------------- |
| python | >=3.9 | 3.10 | |
| cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
| torch | >=2.0 | | |
| transformers | >=4.33 | 4.51 | |
| modelscope | >=1.23 | | |
| peft | >=0.11,<0.16 | | |
| trl | >=0.13,<0.18 | 0.17 | RLHF |
| deepspeed | >=0.14 | 0.14.5 | Training |
| vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
| lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

For more optional dependencies, refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).

## Notebook Environment

Most models that SWIFT supports for training can be used on A10 GPUs. Users can take advantage of the free GPU resources offered by ModelScope:

1. Visit the [ModelScope](https://www.modelscope.cn) official website and log in.
2. Click `My Notebook` on the left and start a free GPU instance.
3. Enjoy the A10 GPU resources.
ms-swift/docs/source_en/Instruction/Agent-support.md
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agent Support
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
## Dataset Format
|
| 5 |
+
Example data samples for the pure text Agent and multimodal Agent are as follows:
|
| 6 |
+
|
| 7 |
+
```jsonl
|
| 8 |
+
{"tools": ["{\"type\": \"function\", \"function\": {\"name\": \"realtime_aqi\", \"description\": \"Weather forecast. Get real-time air quality, including current air quality, PM2.5, and PM10 information.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"city\": {\"type\": \"string\", \"description\": \"City name, e.g., Shanghai\"}}, \"required\": [\"city\"]}}}"], "messages": [{"role": "user", "content": "What is the weather like in Beijing and Shanghai today?"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"Beijing\"}}"}, {"role": "tool_call", "content": "{\"name\": \"realtime_aqi\", \"arguments\": {\"city\": \"Shanghai\"}}"}, {"role": "tool_response", "content": "{\"city\": \"Beijing\", \"aqi\": \"10\", \"unit\": \"celsius\"}"}, {"role": "tool_response", "content": "{\"city\": \"Shanghai\", \"aqi\": \"72\", \"unit\": \"fahrenheit\"}"}, {"role": "assistant", "content": "According to the weather forecast tool, the air quality index (AQI) in Beijing is 10, which indicates good air quality; whereas in Shanghai, the AQI is 72, indicating mild pollution."}]}
|
| 9 |
+
{"tools": ["{\"type\": \"function\", \"function\": {\"name\": \"click\", \"description\": \"Click on a position on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"integer\", \"description\": \"X-coordinate representing the horizontal position on the screen\"}, \"y\": {\"type\": \"integer\", \"description\": \"Y-coordinate representing the vertical position on the screen\"}}, \"required\": [\"x\", \"y\"]}}}"], "messages": [{"role": "user", "content": "<image>What time is it now?"}, {"role": "assistant", "content": "<think>\nI can check the current time by opening the calendar app.\n</think>\n"}, {"role": "tool_call", "content": "{\"name\": \"click\", \"arguments\": {\"x\": 105, \"y\": 132}}"}, {"role": "tool_response", "content": "{\"images\": \"<image>\", \"status\": \"success\"}"}, {"role": "assistant", "content": "Successfully opened the calendar app. The current time is 11 o'clock in the morning."}], "images": ["desktop.png", "calendar.png"]}
```
- When `agent_template` is set to "react_en", "hermes", etc., this format is compatible with Agent training for all models and makes it easy to switch between different models.
- Here, `tools` is a `List[str]`, where each tool must be a JSON string. Additionally, the `content` of messages whose role is `'tool_call'` or `'tool_response'`/`'tool'` must also be a JSON string.
- The `tools` field will be combined with the `{"role": "system", ...}` section during training/inference according to the `agent_template`, forming a complete system section.
- The `{"role": "tool_call", ...}` parts will automatically be converted into the corresponding `{"role": "assistant", ...}` format based on the `agent_template`. Multiple consecutive `{"role": "assistant", ...}` entries will be concatenated into a complete assistant_content.
- `{"role": "tool_response", ...}` can also be written as `{"role": "tool", ...}`; the two forms are equivalent. This part is likewise converted automatically according to the `agent_template`. During training, it does not participate in the loss calculation, similar to `{"role": "user", ...}`.
- This format supports parallel tool calls; refer to the first data sample for an example. In multimodal Agent data samples, the number of `<image>` tags should match the length of "images", and their positions indicate where the image features are inserted. It also supports other modalities, such as audios and videos.
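Putting the rules above together, a dataset row in this format can be assembled with the standard library alone. The sketch below mirrors the first data sample (the tool schema is abbreviated and file handling is omitted):

```python
import json

# Tool schema: the `tools` field must hold JSON strings, one per tool.
realtime_aqi = {
    "type": "function",
    "function": {
        "name": "realtime_aqi",
        "description": "Get real-time air quality for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string", "description": "City name, e.g., Shanghai"}},
            "required": ["city"],
        },
    },
}

# The content of 'tool_call'/'tool_response' messages must also be JSON strings.
row = {
    "tools": [json.dumps(realtime_aqi)],
    "messages": [
        {"role": "user", "content": "What is the weather like in Beijing and Shanghai today?"},
        {"role": "tool_call", "content": json.dumps({"name": "realtime_aqi", "arguments": {"city": "Beijing"}})},
        {"role": "tool_call", "content": json.dumps({"name": "realtime_aqi", "arguments": {"city": "Shanghai"}})},
        {"role": "tool_response", "content": json.dumps({"city": "Beijing", "aqi": "10"})},
        {"role": "tool_response", "content": json.dumps({"city": "Shanghai", "aqi": "72"})},
        {"role": "assistant", "content": "Beijing's AQI is 10; Shanghai's AQI is 72."},
    ],
}

# One such JSON object per line produces a jsonl training file.
line = json.dumps(row, ensure_ascii=False)
```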
The following are the `input_ids` and `labels` obtained by encoding the two data samples above with the **qwen2_5** and **qwen2_5_vl** templates, with the `agent_template` set to **hermes**:
Sample One (Parallel Tool Calls):
```text
[INPUT_IDS] <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "realtime_aqi", "description": "Weather forecast. Get real-time air quality, including current air quality, PM2.5, and PM10 information.", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "City name, e.g., Shanghai"}}, "required": ["city"]}}}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
What is the weather like in Beijing and Shanghai today?<|im_end|>
<|im_start|>assistant
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "Beijing"}}
</tool_call>
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "Shanghai"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
{"city": "Beijing", "aqi": "10", "unit": "celsius"}
</tool_response>
<tool_response>
{"city": "Shanghai", "aqi": "72", "unit": "fahrenheit"}
</tool_response><|im_end|>
<|im_start|>assistant
According to the weather forecast tool, the air quality index (AQI) in Beijing is 10, which indicates good air quality; whereas in Shanghai, the AQI is 72, indicating mild pollution.<|im_end|>
[LABELS] [-100 * 195]<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "Beijing"}}
</tool_call>
<tool_call>
{"name": "realtime_aqi", "arguments": {"city": "Shanghai"}}
</tool_call><|im_end|>[-100 * 67]According to the weather forecast tool, the air quality index (AQI) in Beijing is 10, which indicates good air quality; whereas in Shanghai, the AQI is 72, indicating mild pollution.<|im_end|>
```
Sample Two (Multimodal, Mixed Assistant and Tool Call):
```text
[INPUT_IDS] <|im_start|>system
You are a helpful assistant.
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "click", "description": "Click on a position on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "integer", "description": "X-coordinate representing the horizontal position on the screen"}, "y": {"type": "integer", "description": "Y-coordinate representing the vertical position on the screen"}}, "required": ["x", "y"]}}}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call><|im_end|>
<|im_start|>user
<|vision_start|>[151655 * 729]<|vision_end|>What time is it now?<|im_end|>
<|im_start|>assistant
<think>
I can check the current time by opening the calendar app.
</think>
<tool_call>
{"name": "click", "arguments": {"x": 105, "y": 132}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
{"images": "<|vision_start|>[151655 * 729]<|vision_end|>", "status": "success"}
</tool_response><|im_end|>
<|im_start|>assistant
Successfully opened the calendar app. The current time is 11 o'clock in the morning.<|im_end|>
[LABELS] [-100 * 924]<think>
I can check the current time by opening the calendar app.
</think>
<tool_call>
{"name": "click", "arguments": {"x": 105, "y": 132}}
</tool_call><|im_end|>[-100 * 759]Successfully opened the calendar app. The current time is 11 o'clock in the morning.<|im_end|>
```
**react_en** is one of the commonly used agent template formats. Below is an example of the `input_ids` and `labels` after encoding by qwen2_5 using `agent_template='react_en'`:
```text
[INPUT_IDS] <|im_start|>system
Answer the following questions as best you can. You have access to the following tools:
realtime_aqi: Call this tool to interact with the realtime_aqi API. What is the realtime_aqi API useful for? Weather forecast. Get real-time air quality, including current air quality, PM2.5, and PM10 information. Parameters: {"type": "object", "properties": {"city": {"type": "string", "description": "City name, e.g., Shanghai"}}, "required": ["city"]} Format the arguments as a JSON object.
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [realtime_aqi]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
<|im_end|>
<|im_start|>user
What is the weather like in Beijing and Shanghai today?<|im_end|>
<|im_start|>assistant
Action: realtime_aqi
Action Input: {'city': 'Beijing'}
Action: realtime_aqi
Action Input: {'city': 'Shanghai'}
Observation:{"city": "Beijing", "aqi": "10", "unit": "celsius"}
Observation:{"city": "Shanghai", "aqi": "72", "unit": "fahrenheit"}
According to the weather forecast tool, the air quality index (AQI) in Beijing is 10, which indicates good air quality; whereas in Shanghai, the AQI is 72, indicating mild pollution.<|im_end|>
[LABELS] [-100 * 233]Action: realtime_aqi
Action Input: {'city': 'Beijing'}
Action: realtime_aqi
Action Input: {'city': 'Shanghai'}
Observation:[-100 * 45]According to the weather forecast tool, the air quality index (AQI) in Beijing is 10, which indicates good air quality; whereas in Shanghai, the AQI is 72, indicating mild pollution.<|im_end|>
```
The following code can be used to experiment with more models and `agent_template` options. For more selectable values of `agent_template`, refer to [here](https://github.com/modelscope/swift/blob/main/swift/plugin/agent_template/__init__.py).
```python
from swift.llm import get_model_tokenizer, get_template
_, tokenizer = get_model_tokenizer('ZhipuAI/GLM-4-9B-0414', load_model=False)
template = get_template(tokenizer.model_meta.template, tokenizer, agent_template='hermes')
data = {...}
template.set_mode('train')
encoded = template.encode(data)
print(f'[INPUT_IDS] {template.safe_decode(encoded["input_ids"])}\n')
print(f'[LABELS] {template.safe_decode(encoded["labels"])}')
```
## Tools Format
The tools field provides information about the APIs that the model can call. You need to provide the name, description, and parameters of the tools, as shown in the example below:
```python
tools = [{
'type': 'function',
'function': {
'name': 'get_current_weather',
'description': 'Get the current weather in a given location',
'parameters': {
'type': 'object',
'properties': {
'location': {
'type': 'string',
'description': 'The city and state, e.g. San Francisco, CA'
},
'unit': {
'type': 'string',
'enum': ['celsius', 'fahrenheit']
}
},
'required': ['location']
}
}
}]
```
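To use such a definition in a training sample, each tool entry is serialized to a JSON string for the dataset's `tools` field (a `List[str]`, as described in the Agent data format above). A minimal sketch:

```python
import json

tools = [{
    'type': 'function',
    'function': {
        'name': 'get_current_weather',
        'description': 'Get the current weather in a given location',
        'parameters': {
            'type': 'object',
            'properties': {
                'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'},
                'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']},
            },
            'required': ['location'],
        },
    },
}]

# The dataset's `tools` field expects one JSON string per tool.
tools_field = [json.dumps(t) for t in tools]
```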
## Usage of loss_scale
`loss_scale` can be used to adjust the training loss weight for the model's output section. For example, in the ReACT format, you can set `--loss_scale react` (the loss_scale configuration file is written [here](https://github.com/modelscope/swift/blob/main/swift/plugin/loss_scale/config/react.json)). The role of this parameter is as follows:
- The weight for the 'Thought:' and 'Final Answer:' sections is 1.
- The weight for the 'Action:' and 'Action Input:' sections is 2.
- The weight for the 'Observation:' field itself is 2.
- The weight for the tool invocation results following the 'Observation:' field is 0.
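Conceptually, these weights scale the per-token cross-entropy loss before it is aggregated. The snippet below is only an illustration of that idea with invented numbers, not ms-swift's actual implementation:

```python
# Per-token losses for a ReACT-style response, and their loss_scale weights:
# weight 1 for Thought/Final Answer tokens, weight 2 for Action/Action Input
# and the 'Observation:' marker, weight 0 for the tool result.
token_losses = [0.5, 0.4, 0.9, 1.1, 0.3, 0.7]
loss_scale   = [1.0, 1.0, 2.0, 2.0, 0.0, 1.0]

# Scale each token's loss by its weight before aggregating.
weighted = [l * s for l, s in zip(token_losses, loss_scale)]
total_loss = sum(weighted)  # tokens with scale 0 contribute nothing
```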
For the detailed design of the `loss_scale` plugin, please refer to the [Plugin-based Architecture](../Customization/Pluginization.md) documentation.
## Training
- Train the Agent capabilities of base models; switch between models by modifying `--model`. Refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent/qwen2_5.sh).
- The agent_template for training GLM4 is hermes. Refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/agent/glm4.sh).
- Use `--loss_scale` to adjust the loss weight of the model output section. Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/agent/loss_scale).
## Inference
- 🚀For inference of the original model or fully trained model, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py).
- For inference after LoRA training, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/agent/loss_scale/infer.md).
## Deployment
For server and client code, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/agent).
ms-swift/docs/source_en/Instruction/Command-line-parameters.md
# Command Line Parameters
Command-line parameters are grouped into base arguments, atomic arguments, integrated arguments, and model-specific arguments. The arguments actually accepted on the command line are the integrated arguments, which inherit from the base arguments and some atomic arguments. Model-specific arguments are designed for particular models and can be set via `--model_kwargs` or environment variables. For the Megatron-SWIFT command-line arguments, see the [Megatron-SWIFT Training Documentation](./Megatron-SWIFT-Training.md).
Hints:
- For passing a list in the command line, you can separate items with spaces. For example: `--dataset <dataset_path1> <dataset_path2>`.
- For passing a dict in the command line, use JSON format. For example: `--model_kwargs '{"fps_max_frames": 12}'`.
- Parameters marked with 🔥 are important. New users familiarizing themselves with ms-swift can focus on these command line parameters first.
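These list/dict conventions match standard argparse behavior; the sketch below is illustrative only, not ms-swift's actual parser:

```python
import argparse
import json

parser = argparse.ArgumentParser()
# Lists are passed as space-separated values: --dataset path1 path2
parser.add_argument('--dataset', nargs='+', default=[])
# Dicts are passed as a JSON string: --model_kwargs '{"fps_max_frames": 12}'
parser.add_argument('--model_kwargs', type=json.loads, default={})

args = parser.parse_args(['--dataset', 'a.jsonl', 'b.jsonl',
                          '--model_kwargs', '{"fps_max_frames": 12}'])
```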
## Base Arguments
- 🔥tuner_backend: Options are 'peft', 'unsloth'. Default is 'peft'.
- 🔥train_type: Options are: 'lora', 'full', 'longlora', 'adalora', 'llamapro', 'adapter', 'vera', 'boft', 'fourierft', 'reft'. Default is 'lora'.
- 🔥adapters: A list used to specify the id/path of the adapter. Default is `[]`.
- external_plugins: A list of external plugin `.py` files that will be registered into the plugin mappings. Please check [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh).
- seed: Default is 42.
- model_kwargs: Additional parameters specific to the model that can be passed in. This list of parameters will log a message during training and inference for reference. For example, `--model_kwargs '{"fps_max_frames": 12}'`.
- load_args: When `--resume_from_checkpoint`, `--model`, or `--adapters` is specified, the `args.json` file saved in the checkpoint is read, and its values are assigned to any `basic arguments` that are still None (excluding data and generation arguments); these can be overridden by passing arguments manually. The default is True for inference and export, and False for training.
- load_data_args: If this parameter is set to True, additional data parameters will be read from args.json. The default is False.
- use_hf: Controls whether ModelScope or HuggingFace is used for model and dataset downloads, and model pushing. Defaults to False, meaning ModelScope is used.
- hub_token: Hub token. The hub token for ModelScope can be viewed [here](https://modelscope.cn/my/myaccesstoken).
- custom_register_path: A list of paths to `.py` files for custom registration of models, dialogue templates, and datasets. Defaults to `[]`.
### Model Arguments
- 🔥model: Model ID or local path to the model. If it's a custom model, please use it with `model_type` and `template`. The specific details can be referred to in the [Custom Model](../Customization/Custom-model.md).
- model_type: Model type. The same model architecture, template, and model loading process are defined as a model_type. The default is None, and it will be automatically selected based on the suffix of `--model` and the architectures attribute in config.json.
- model_revision: Model revision, default is None.
- task_type: The default value is 'causal_lm'. Optional values are 'causal_lm', 'seq_cls', and 'embedding'. Examples for seq_cls can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls), and examples for embedding can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/embedding).
- 🔥torch_dtype: Data type of model weights, supports `float16`, `bfloat16`, `float32`. The default is None, and it is read from the 'config.json' file.
- attn_impl: Attention implementation; options are `flash_attn`, `sdpa`, `eager`. The default is sdpa; if not supported, eager is used.
- Note: These three implementations may not all be supported, depending on the support of the corresponding model.
- num_labels: This parameter is required for classification models (i.e., `--task_type seq_cls`). It represents the number of labels, with a default value of None.
- problem_type: This parameter is required for classification models (i.e., `--task_type seq_cls`). The options are 'regression', 'single_label_classification', and 'multi_label_classification'. The default value is None, and it will be automatically set based on the number of labels and the dataset type.
- rope_scaling: RoPE scaling type, supports `linear` and `dynamic`; it should be used in conjunction with `max_length`. Default is None.
- device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions.
- max_memory: When device_map is set to 'auto' or 'sequential', the model weights will be allocated to devices based on max_memory, for example: `--max_memory '{0: "20GB", 1: "20GB"}'`. The default value is None.
- local_repo_path: Some models depend on a GitHub repo when loading. To avoid network issues during `git clone`, a local repo can be used directly. This parameter needs to be passed with the path to the local repo, with the default being `None`.
- init_strategy: When loading the model, initialize all uninitialized parameters. Optional values are 'zero', 'uniform', 'normal', 'xavier_uniform', 'xavier_normal', 'kaiming_uniform', 'kaiming_normal', 'orthogonal'. Default is None.
### Data Arguments
- 🔥dataset: A list of dataset IDs or paths. Default is `[]`. The input format for each dataset is: `dataset ID or dataset path:sub-dataset#sampling size`, where sub-dataset and sampling data are optional. Local datasets support jsonl, csv, json, folders, etc. Open-source datasets can be cloned locally via git and used offline by passing the folder. For custom dataset formats, refer to [Custom Dataset](../Customization/Custom-dataset.md). You can pass in `--dataset <dataset1> <dataset2>` to use multiple datasets.
- Sub-dataset: This parameter is effective only when the dataset is an ID or folder. If a subset was specified during registration, and only one sub-dataset exists, the registered sub-dataset is selected by default; otherwise, it defaults to 'default'. You can use `/` to select multiple sub-datasets, e.g., `<dataset_id>:subset1/subset2`. You can also use 'all' to select all sub-datasets, e.g., `<dataset_id>:all`.
- Sampling Size: By default, the complete dataset is used. If the sampling size is less than the total number of data samples, samples are selected randomly without repetition. If the sampling size exceeds the total, the full dataset is repeated `sampling size // total data samples` times, and an additional `sampling size % total data samples` samples are drawn randomly. Note: Streaming datasets only perform sequential sampling. If `--dataset_shuffle false` is set, non-streaming datasets will also perform sequential sampling.
- 🔥val_dataset: A list of validation set IDs or paths. Default is `[]`.
- 🔥split_dataset_ratio: Ratio for splitting the training set and validation set when val_dataset is not specified, default is 0.01. Set to 0 if no validation set split is needed.
- data_seed: Random seed for the dataset, default is 42.
- 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1.
- 🔥load_from_cache_file: Whether to load the dataset from the cache, default is False.
- dataset_shuffle: Whether to shuffle the dataset. Defaults to True.
- Note: The shuffling in CPT/SFT consists of two parts: dataset shuffling, controlled by `dataset_shuffle`; and shuffling in the train_dataloader, controlled by `train_dataloader_shuffle`.
- val_dataset_shuffle: Whether to perform shuffling on the val_dataset. Default is False.
- 🔥streaming: Stream reading and processing of the dataset, default is False. It is typically set to True when handling large datasets.
- Note: You need to set `--max_steps` explicitly, as the streaming dataset does not have a defined length. You can achieve training equivalent to `--num_train_epochs` by setting `--save_strategy epoch` and specifying a sufficiently large `max_steps`. Alternatively, you can set `max_epochs` to ensure training exits after the corresponding number of epochs, at which point the model weights will be validated and saved.
- interleave_prob: Defaults to None. When combining multiple datasets, the `concatenate_datasets` function is used by default. If this parameter is set, the `interleave_datasets` function will be used instead. This parameter is typically used when combining streaming datasets and is passed to the `interleave_datasets` function.
- stopping_strategy: Can be either "first_exhausted" or "all_exhausted", with the default being "first_exhausted". This parameter is passed to the `interleave_datasets` function.
- shuffle_buffer_size: This parameter is used to specify the shuffle buffer size for streaming datasets. Defaults to 1000.
- download_mode: Dataset download mode, including `reuse_dataset_if_exists` and `force_redownload`, default is reuse_dataset_if_exists.
- columns: Used for column mapping of the dataset to ensure that the dataset conforms to the format that AutoPreprocessor can handle. For more details, see [here](../Customization/Custom-dataset.md). You can pass in a JSON string, for example: `'{"text1": "query", "text2": "response"}'`, with the default being None.
- strict: If set to True, any row with an issue in the dataset will throw an error immediately, otherwise, erroneous data samples will be discarded. Default is False.
- remove_unused_columns: Whether to remove unused columns in the dataset, defaults to True.
- 🔥model_name: Only applicable to the self-cognition task and effective only on the `swift/self-cognition` dataset. It replaces the `{{NAME}}` placeholder in the dataset. Input the model's name in both Chinese and English, separated by a space, for example: `--model_name 小黄 'Xiao Huang'`. Default is None.
- 🔥model_author: Only applicable to the self-cognition task and effective only on the `swift/self-cognition` dataset. It replaces the `{{AUTHOR}}` placeholder in the dataset. Input the model author's name in both Chinese and English, separated by a space, for example: `--model_author '魔搭' 'ModelScope'`. Default is None.
- custom_dataset_info: The path to the JSON file for custom dataset registration. Refer to [Custom Dataset](../Customization/Custom-dataset.md). Default is `[]`.
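The sampling-size rule for `--dataset` described above can be sketched as follows (illustrative; non-streaming with shuffling enabled):

```python
import random

def sample_indices(total: int, sample_size: int, seed: int = 42) -> list:
    """Sampling-size rule: repeat the full dataset sample_size // total
    times, then randomly draw sample_size % total extra samples
    without repetition."""
    rng = random.Random(seed)
    repeats, remainder = divmod(sample_size, total)
    indices = list(range(total)) * repeats
    indices += rng.sample(range(total), remainder)
    return indices

# 10-sample dataset with a '#25' sampling size: 2 full passes + 5 extras
idx = sample_indices(10, 25)
```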
### Template Arguments
- 🔥template: Type of dialogue template. Default is None, which automatically selects the corresponding model's template type.
- 🔥system: Custom system field, can take a string or txt file path as input. Default is None, uses the default system of the template.
- Note: The system priority in the dataset is the highest, followed by `--system`, and finally the `default_system` defined in the template.
- 🔥max_length: The maximum length of tokens for a single sample. Defaults to None, set to the maximum length of tokens supported by the model (max_model_len).
- Note: In the cases of PPO, GRPO, and inference, max_length represents max_prompt_length.
- truncation_strategy: Strategy for handling single sample tokens that exceed `max_length`. Options are `delete`, `left`, and `right`, representing deletion, left-side truncation, and right-side truncation, respectively. The default is 'delete'.
- 🔥max_pixels: The maximum number of pixels (H*W) for input images to a multimodal model. Images exceeding this limit will be scaled. Default is None, meaning no maximum pixel limit.
- 🔥agent_template: Agent template, which determines how to convert the list of tools into a system, how to extract tool calls from the model's response, and specifies the template format for `{"role": "tool_call", "content": "xxx"}` and `{"role": "tool_response", "content": "xxx"}`. Optional values include "react_en", "hermes", "glm4", "qwen_en", "toolbench", etc. For more details, please check [here](https://github.com/modelscope/ms-swift/blob/main/swift/plugin/agent_template/__init__.py). The default value is None, meaning it will be selected based on the model type.
- norm_bbox: Controls how to scale bounding boxes (bbox). Options are 'norm1000' and 'none'. 'norm1000' represents scaling bbox coordinates to one-thousandths, and 'none' means no scaling. Default is None, automatically selected based on the model.
- response_prefix: The prefix character for the response, for example, setting the response_prefix to `'<think>\n'` for QwQ-32B. The default is None, and it is automatically set according to the model.
- Note: If you are training the deepseek-r1/qwq model with a dataset that does not include `<think>...</think>`, please pass `--response_prefix ''` additionally when inferring after training.
- padding_side: Padding side when `batch_size>=2` during training. Options are 'left' and 'right', with 'right' as the default. (For inference with batch_size>=2, only left padding is applied.)
- loss_scale: Weight setting for the loss of training tokens. Default is `'default'`, which means that all responses (including history) are used with a weight of 1 in cross-entropy loss, and the loss from the corresponding `tool_response` in the agent_template is ignored. Possible values include: 'default', 'last_round', 'all', 'ignore_empty_think', and agent-specific options: 'react', 'hermes', 'qwen', 'agentflan', 'alpha_umi'. For more details about the agent part, please refer to [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md).
- 'last_round': Only calculate the loss for the last round of response.
- 'all': Calculate the loss for all tokens.
- 'ignore_empty_think': On top of 'default', ignore the loss calculation for empty `'<think>\n\n</think>\n\n'`. See [this issue](https://github.com/modelscope/ms-swift/issues/4030) for more details.
- `'react'`, `'hermes'`, `'qwen'`: On top of `'default'`, set the loss weight of the `tool_call` part to 2.
- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supported in pt/sft/dpo. The training script refers to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
|
| 86 |
+
- use_chat_template: Use chat template or generation template, default is `True`. `swift pt` is automatically set to the generation template.
|
| 87 |
+
- template_backend: Selection of the template backend. Options are 'swift' and 'jinja', with 'swift' as the default. If using jinja, it applies transformer's `apply_chat_template`.
|
| 88 |
+
- Note: The jinja template backend supports only inference, not training.
|
| 89 |
+
|
| 90 |
+
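As a hedged illustration only (the model and dataset names below are placeholders, not from the original text), the template arguments above might be combined in a training command like:

```shell
# Illustrative sketch: compute loss only on the last round, pad on the right
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset your-dataset \
    --loss_scale last_round \
    --padding_side right
```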
### Generation Arguments

Refer to the [generation_config](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig) documentation.

- 🔥max_new_tokens: The maximum number of new tokens generated during inference. Defaults to None, meaning unlimited.
- temperature: The temperature parameter. Defaults to None and is read from generation_config.json.
  - Note: The do_sample parameter has been removed in this version. Set temperature to 0 to achieve the same effect.
- top_k: The top_k parameter. Defaults to None and is read from generation_config.json.
- top_p: The top_p parameter. Defaults to None and is read from generation_config.json.
- repetition_penalty: The repetition penalty. Defaults to None and is read from generation_config.json.
- num_beams: The number of beams reserved for parallel beam search, default is 1.
- 🔥stream: Stream output, default is `False`.
- stop_words: Additional stop words beyond eos_token, default is `[]`.
  - Note: The eos_token is removed from the output response, whereas additional stop words are retained in the output.
- logprobs: Whether to output logprobs, default is False.
- top_logprobs: The number of top_logprobs to output, defaults to None.

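A hedged sketch of passing generation arguments at inference time (the model name is a placeholder):

```shell
swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --max_new_tokens 2048 \
    --temperature 0.7 \
    --top_p 0.9 \
    --stream true
```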
### Quantization Arguments

The following are the parameters for quantization when loading a model. For detailed meanings, refer to the [quantization](https://huggingface.co/docs/transformers/main/en/main_classes/quantization) documentation. Note that this does not include the `gptq` and `awq` quantization parameters involved in `swift export`.

- 🔥quant_method: The quantization method used when loading the model. Options are `bnb`, `hqq`, and `eetq`.
- 🔥quant_bits: Number of bits for quantization, default is None.
- hqq_axis: HQQ quantization axis, default is None.
- bnb_4bit_compute_dtype: The computation type for bnb quantization. Options are `float16`, `bfloat16`, `float32`. The default is None, which sets it to `torch_dtype`.
- bnb_4bit_quant_type: BNB quantization type, supports `fp4` and `nf4`, default is `nf4`.
- bnb_4bit_use_double_quant: Whether to use double quantization, default is `True`.
- bnb_4bit_quant_storage: BNB quantization storage type, default is None.

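For example, a QLoRA-style setup (a sketch, assuming a bnb-compatible GPU; the model name is a placeholder) might load the model 4-bit quantized while training a LoRA:

```shell
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type lora \
    --quant_method bnb \
    --quant_bits 4 \
    --bnb_4bit_compute_dtype bfloat16
```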
## Atomic Arguments

### Seq2SeqTrainer Arguments

This parameter list inherits from transformers' `Seq2SeqTrainingArguments`, with default values overridden by ms-swift. For unlisted items, refer to the [HF Official Documentation](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments).

- 🔥output_dir: Defaults to None, set as `output/<model_name>`.
- 🔥gradient_checkpointing: Whether to use gradient checkpointing, default is True.
- 🔥deepspeed: Defaults to None. It can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', or 'zero3_offload' to use the built-in deepspeed configuration files of ms-swift.
- zero_hpz_partition_size: Default is `None`. This parameter is a feature of `ZeRO++`, which implements model sharding within nodes and data sharding between nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`.
- 🔥per_device_train_batch_size: Default is 1.
- 🔥per_device_eval_batch_size: Default is 1.
- 🔥gradient_accumulation_steps: Gradient accumulation, default is None, meaning gradient_accumulation_steps is set such that total_batch_size >= 16. The total_batch_size equals `per_device_train_batch_size * gradient_accumulation_steps * world_size`.
- weight_decay: Weight decay coefficient, default is 0.1.
- adam_beta2: Default is 0.95.
- 🔥learning_rate: Learning rate, defaults to 1e-5 for full-parameter training, and 1e-4 for LoRA and other tuners.
- lr_scheduler_type: Type of lr_scheduler, defaults to 'cosine'.
- lr_scheduler_kwargs: Other parameters for the lr_scheduler, defaults to None.
- 🔥gradient_checkpointing_kwargs: Parameters for `torch.utils.checkpoint`. For example, `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. Defaults to None.
- full_determinism: Ensures reproducible results during training. Note: this will negatively impact performance. Defaults to False.
- 🔥report_to: Default is `tensorboard`. You can also specify `--report_to tensorboard wandb swanlab` or `--report_to all`.
- logging_first_step: Whether to log the first step, defaults to True.
- logging_steps: Interval for logging, defaults to 5.
- predict_with_generate: Whether to use the generative method during validation, default is False.
- metric_for_best_model: Default is None: when predict_with_generate is False it is set to 'loss', otherwise to 'rouge-l' (during PPO training no default is set; in GRPO training it is set to 'reward').
- greater_is_better: Defaults to None, which sets it to False when `metric_for_best_model` contains 'loss', and to True otherwise.
- max_epochs: Forces training to exit after reaching `max_epochs`, performing validation and saving the model weights. This parameter is especially useful when using a streaming dataset. Default is None.

Other important parameters:
- 🔥num_train_epochs: Number of training epochs, default is 3.
- 🔥save_strategy: Strategy for saving the model; options are 'no', 'steps', 'epoch'. Default is 'steps'.
- 🔥save_steps: Default is 500.
- 🔥eval_strategy: Evaluation strategy. Default is None, following the strategy of `save_strategy`.
- 🔥eval_steps: Default is None. If there is an evaluation dataset, it follows the strategy of `save_steps`.
- 🔥save_total_limit: Maximum number of checkpoints to keep; older checkpoints are deleted. Default is None, keeping all checkpoints.
- max_steps: Maximum number of training steps. Must be set when the dataset is streamed. Default is -1.
- 🔥warmup_ratio: Default is 0.
- save_on_each_node: Default is False. Should be considered in multi-node training.
- save_only_model: Whether to save only the model weights, without the optimizer state, random seed state, etc. Default is False.
- 🔥resume_from_checkpoint: Parameter for resuming training from a checkpoint; pass the checkpoint path. Default is None.
  - Note: `resume_from_checkpoint` loads the model weights, optimizer state, and random seed, and continues training from the last trained step. You can specify `--resume_only_model` to load only the model weights.
- 🔥ddp_backend: Options are "nccl", "gloo", "mpi", "ccl", "hccl", "cncl", and "mccl". Default is None, enabling automatic selection.
- 🔥ddp_find_unused_parameters: Default is None.
- ddp_timeout: Default is 1800, in seconds.
- 🔥dataloader_num_workers: Defaults to None. On Windows it is set to 0; otherwise it is set to 1.
- dataloader_pin_memory: Default is True.
- dataloader_persistent_workers: Default is False.
- dataloader_prefetch_factor: Defaults to None. If `dataloader_num_workers > 0`, it is set to 10.
- train_dataloader_shuffle: Whether the dataloader for CPT/SFT training is shuffled, default is True. This parameter does not apply to IterableDataset, which reads sequentially.
- 🔥neftune_noise_alpha: Coefficient of the noise added by NEFTune, default is 0. Typical values are 5, 10, 15.
- 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory consumption. Defaults to False. An example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger).
- average_tokens_across_devices: Whether to average the number of tokens across devices. If True, `num_tokens_in_batch` is synchronized via all_reduce for accurate loss calculation. Default is False.
- max_grad_norm: Gradient clipping. Default is 1.
- push_to_hub: Push the checkpoint to the hub. Default is False.
- hub_model_id: Default is None.
- hub_private_repo: Default is False.

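The total_batch_size relation above can be sketched as a small shell computation (this helper logic is illustrative, not ms-swift's actual implementation):

```shell
# Smallest gradient_accumulation_steps such that
# per_device_train_batch_size * steps * world_size >= 16
per_device_train_batch_size=1
world_size=2
steps=1
while [ $((per_device_train_batch_size * steps * world_size)) -lt 16 ]; do
  steps=$((steps + 1))
done
echo "gradient_accumulation_steps=$steps"  # total_batch_size = 1 * 8 * 2 = 16
```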
### Tuner Arguments

- 🔥freeze_llm: This parameter only takes effect for multimodal models and can be used in both full-parameter and LoRA training, but with different meanings. In full-parameter training, setting freeze_llm to True freezes the weights of the LLM part. In LoRA training with `target_modules` set to 'all-linear', setting freeze_llm to True prevents LoRA modules from being added to the LLM part. The default is False.
- 🔥freeze_vit: This parameter only takes effect for multimodal models and can be used in both full-parameter and LoRA training, with a meaning analogous to `freeze_llm`. The default is True.
- 🔥freeze_aligner: This parameter only takes effect for multimodal models and can be used in both full-parameter and LoRA training, with a meaning analogous to `freeze_llm`. The default is True.
- 🔥target_modules: Specifies the LoRA modules, with a default of `all-linear`. Its behavior differs between LLMs and multimodal LLMs. For an LLM, it automatically finds all linear modules except lm_head and attaches a tuner. For a multimodal LLM, it only attaches a tuner to the LLM part by default; this behavior can be controlled by `freeze_llm`, `freeze_vit`, and `freeze_aligner`. This parameter is not limited to LoRA and can be used with other tuners.
- 🔥target_regex: Specifies a regex expression for the LoRA modules, with a default of `None`. If this value is provided, the target_modules parameter is ignored. This parameter is not limited to LoRA and can be used with other tuners.
- init_weights: Specifies the method for initializing weights. LoRA can specify `true`, `false`, `gaussian`, `pissa`, or `pissa_niter_[number of iters]`. Bone can specify `true`, `false`, or `bat`. The default is `true`.
- 🔥modules_to_save: After attaching a tuner, explicitly specifies additional original model modules to participate in training and storage. The default is `[]`. This parameter is not limited to LoRA and can be used with other tuners.

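As a hedged example (the model name is a placeholder), LoRA training of a multimodal model that also tunes the ViT while keeping the aligner frozen could look like:

```shell
swift sft \
    --model Qwen/Qwen2.5-VL-7B-Instruct \
    --train_type lora \
    --target_modules all-linear \
    --freeze_vit false \
    --freeze_aligner true
```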
#### Full Arguments

- freeze_parameters: Prefixes of the parameters to be frozen, default is `[]`.
- freeze_parameters_regex: Regex for matching the parameters to be frozen, default is None.
- freeze_parameters_ratio: Ratio of parameters to freeze from bottom to top, default is 0. It can be set to 1 to freeze all parameters, and trainable parameters can then be specified in combination with the options below.
- trainable_parameters: Prefixes of additional trainable parameters, default is `[]`.
- trainable_parameters_regex: Regex for matching additional trainable parameters, default is None.
  - Note: `trainable_parameters` and `trainable_parameters_regex` take precedence over `freeze_parameters`, `freeze_parameters_regex`, and `freeze_parameters_ratio`. When full-parameter training is specified, all modules are first set to trainable, some parameters are then frozen according to `freeze_parameters`, `freeze_parameters_regex`, and `freeze_parameters_ratio`, and finally some parameters are reopened for training according to `trainable_parameters` and `trainable_parameters_regex`.

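A hedged sketch of the freeze/unfreeze interplay described above (the module prefixes are illustrative and model-dependent):

```shell
# Freeze everything, then reopen only the chosen prefixes for training
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type full \
    --freeze_parameters_ratio 1 \
    --trainable_parameters model.layers.27 lm_head
```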
#### LoRA

- 🔥lora_rank: Default is `8`.
- 🔥lora_alpha: Default is `32`.
- lora_dropout: Default is `0.05`.
- lora_bias: Defaults to `'none'`. Possible values are 'none' and 'all'. To make all biases trainable, set it to `'all'`.
- lora_dtype: Specifies the dtype of the LoRA modules. Supported types are 'float16', 'bfloat16', 'float32'. The default is None, following the original model's dtype.
- 🔥use_dora: Defaults to `False`; whether to use `DoRA`.
- use_rslora: Defaults to `False`; whether to use `RS-LoRA`.
- 🔥lorap_lr_ratio: LoRA+ parameter, default `None`; recommended values are `10~16`. Specify this parameter when using LoRA to enable LoRA+.

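Putting the LoRA arguments together, a hedged sketch with the defaults shown explicitly (the model name is a placeholder):

```shell
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type lora \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout 0.05
```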
##### LoRA-GA
- lora_ga_batch_size: Default is `2`. The batch size used for estimating gradients during LoRA-GA initialization.
- lora_ga_iters: Default is `2`. The number of iterations for estimating gradients during LoRA-GA initialization.
- lora_ga_max_length: Default is `1024`. The maximum input length for estimating gradients during LoRA-GA initialization.
- lora_ga_direction: Default is `ArB2r`. The initial direction used for gradient estimation during LoRA-GA initialization. Allowed values: `ArBr`, `A2rBr`, `ArB2r`, and `random`.
- lora_ga_scale: Default is `stable`. The scaling method for LoRA-GA initialization. Allowed values: `gd`, `unit`, `stable`, and `weightS`.
- lora_ga_stable_gamma: Default is `16`. The gamma value when `stable` scaling is chosen for initialization.

#### FourierFt

FourierFt uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`.

- fourier_n_frequency: Number of frequencies in the Fourier transform; an `int`, similar to `r` in LoRA. Default is `2000`.
- fourier_scaling: Scaling value of the W matrix; a `float`, similar to `lora_alpha` in LoRA. Default is `300.0`.

#### BOFT

BOFT uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`.

- boft_block_size: Size of BOFT blocks, default is 4.
- boft_block_num: Number of BOFT blocks; cannot be used simultaneously with `boft_block_size`.
- boft_dropout: Dropout value for BOFT, default is 0.0.

#### Vera

Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`.

- vera_rank: Size of the Vera attention, default is 256.
- vera_projection_prng_key: Whether to store the Vera mapping matrix, default is True.
- vera_dropout: Dropout value for Vera, default is `0.0`.
- vera_d_initial: Initial value of Vera's d matrix, default is `0.1`.

#### GaLore

- 🔥use_galore: Default is False; whether to use GaLore.
- galore_target_modules: Default is None; if not provided, GaLore is applied to the attention and MLP modules.
- galore_rank: Default is 128; the GaLore rank.
- galore_update_proj_gap: Default is 50; the interval for updating the decomposed matrices.
- galore_scale: Default is 1.0; the matrix weight coefficient.
- galore_proj_type: Default is `std`; the type of GaLore matrix decomposition.
- galore_optim_per_parameter: Default is False; whether to set a separate optimizer for each GaLore target parameter.
- galore_with_embedding: Default is False; whether to apply GaLore to the embedding.
- galore_quantization: Whether to use Q-GaLore, default is `False`.
- galore_proj_quant: Whether to quantize the SVD decomposition matrix, default is `False`.
- galore_proj_bits: Number of bits for SVD quantization.
- galore_proj_group_size: Number of groups for SVD quantization.
- galore_cos_threshold: Cosine similarity threshold for updating projection matrices. Default is 0.4.
- galore_gamma_proj: As the projection matrices become more similar over time, this coefficient extends the update interval. Default is 2.
- galore_queue_size: Length of the queue used for computing projection-matrix similarity, default is 5.

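A hedged sketch of enabling GaLore for full-parameter training (the model name is a placeholder):

```shell
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type full \
    --use_galore true \
    --galore_rank 128 \
    --galore_update_proj_gap 50
```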
#### LISA

Note: LISA only supports full parameters, i.e., `--train_type full`.

- 🔥lisa_activated_layers: Default is `0`, meaning LISA is not used. A non-zero value specifies the number of activated layers; recommended values are 2 or 8.
- lisa_step_interval: Default is `20`; the number of iterations after which the set of layers receiving backpropagation is switched.

#### UNSLOTH

🔥Unsloth has no new parameters; adjusting existing ones suffices to support it:

```
--tuner_backend unsloth
--train_type full/lora
--quant_bits 4
```

#### LLAMAPRO

- 🔥llamapro_num_new_blocks: Default is `4`; the total number of new layers to insert.
- llamapro_num_groups: Default is `None`; the number of groups into which new blocks are inserted. If `None`, it equals `llamapro_num_new_blocks`, meaning each new layer is inserted into the original model separately.

#### AdaLoRA

The following parameters take effect when `train_type` is set to `adalora`. AdaLoRA parameters such as `target_modules` inherit from the corresponding `lora` parameters, but the `lora_dtype` parameter does not take effect.

- adalora_target_r: Default is `8`; the average rank of AdaLoRA.
- adalora_init_r: Default is `12`; the initial rank of AdaLoRA.
- adalora_tinit: Default is `0`; the initial warmup of AdaLoRA.
- adalora_tfinal: Default is `0`; the final warmup of AdaLoRA.
- adalora_deltaT: Default is `1`; the step interval of AdaLoRA.
- adalora_beta1: Default is `0.85`; EMA parameter of AdaLoRA.
- adalora_beta2: Default is `0.85`; EMA parameter of AdaLoRA.
- adalora_orth_reg_weight: Default is `0.5`; the regularization parameter of AdaLoRA.

#### ReFT

The following parameters take effect when `train_type` is set to `reft`.

> 1. ReFT cannot merge tuners.
> 2. ReFT is not compatible with gradient checkpointing.
> 3. If you experience issues while using DeepSpeed, please uninstall DeepSpeed temporarily.

- 🔥reft_layers: Which layers ReFT is applied to; default is `None`, meaning all layers. You can pass a list of layer numbers, e.g., `--reft_layers 1 2 3 4`.
- 🔥reft_rank: Rank of the ReFT matrix, default is `4`.
- reft_intervention_type: Type of ReFT intervention. Supports 'NoreftIntervention', 'LoreftIntervention', 'ConsreftIntervention', 'LobireftIntervention', 'DireftIntervention', 'NodireftIntervention'; default is `LoreftIntervention`.
- reft_args: Other supported parameters for the ReFT intervention, passed as a JSON string.

### LMDeploy Arguments

Parameter meanings can be found in the [lmdeploy documentation](https://lmdeploy.readthedocs.io/en/latest/api/pipeline.html#turbomindengineconfig).

- 🔥tp: Tensor parallelism degree. Default is `1`.
- session_len: Default is `None`.
- cache_max_entry_count: Default is `0.8`.
- quant_policy: Default is `0`.
- vision_batch_size: Default is `1`.

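A hedged sketch of inference through the lmdeploy backend (the model name is a placeholder):

```shell
swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend lmdeploy \
    --tp 2 \
    --cache_max_entry_count 0.8
```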
### vLLM Arguments

Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai/en/latest/serving/engine_args.html).

- 🔥gpu_memory_utilization: Default is `0.9`.
- 🔥tensor_parallel_size: Default is `1`.
- pipeline_parallel_size: Default is `1`.
- max_num_seqs: Default is `256`.
- 🔥max_model_len: Default is `None`.
- disable_custom_all_reduce: Default is `False`.
- enforce_eager: Whether vllm uses PyTorch eager mode instead of building a CUDA graph; default is `False`. Setting it to True can save memory but may reduce efficiency.
- 🔥limit_mm_per_prompt: Controls the number of media items per prompt in vllm, default is `None`. For example, pass `--limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`; the LoRA rank supported by vllm.
- vllm_quantization: vllm can quantize the model with this argument; supported values can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html).
- enable_prefix_caching: Enable vllm's automatic prefix caching to save processing time on repeated prefix queries. Default is `False`.

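A hedged sketch of inference through the vLLM backend (the model name and lengths are placeholders):

```shell
swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend vllm \
    --gpu_memory_utilization 0.9 \
    --max_model_len 8192 \
    --tensor_parallel_size 2
```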
### Merge Arguments

- 🔥merge_lora: Whether to merge LoRA weights; this parameter supports lora, llamapro, and longlora, default is `False`. Example parameters [here](https://github.com/modelscope/ms-swift/blob/main/examples/export/merge_lora.sh).
- safe_serialization: Whether to store safetensors, default is True.
- max_shard_size: Maximum size of a single storage file, default is '5GB'.

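A hedged sketch of merging LoRA weights after training (the adapter path is an illustrative placeholder):

```shell
swift export \
    --adapters output/vx-xxx/checkpoint-xxx \
    --merge_lora true \
    --safe_serialization true
```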
## Integration Arguments

### Training Arguments

Training arguments include the [base arguments](#base-arguments), [Seq2SeqTrainer arguments](#Seq2SeqTrainer-arguments), and [tuner arguments](#tuner-arguments), as well as the following parts:

- add_version: Append a `'<version>-<timestamp>'` subdirectory to output_dir to prevent overwriting weights, default is True.
- resume_only_model: Defaults to False. If set to True in conjunction with `resume_from_checkpoint`, only the model weights are resumed.
- check_model: Check local model files for corruption or modification and issue a prompt, default is True. In an offline environment, set it to False.
- 🔥create_checkpoint_symlink: Creates additional checkpoint symlinks to facilitate automated training scripts. The symlink paths for `best_model` and `last_model` are `f'{output_dir}/best'` and `f'{output_dir}/last'` respectively.
- loss_type: Type of loss. Defaults to None, meaning the model's built-in loss function is used.
- 🔥packing: Whether to use sequence packing to improve computational efficiency. Default is False.
  - Note: When using packing, combine it with `--attn_impl flash_attn` and ensure "transformers>=4.44". For details, see [this PR](https://github.com/huggingface/transformers/pull/31629).
- 🔥lazy_tokenize: Whether to use lazy tokenization. If set to False, all dataset samples are tokenized before training (for multimodal models, this includes reading images from disk). Defaults to False for LLM training and True for MLLM training, to save memory.
- acc_strategy: Strategy for computing accuracy during training and validation. Options are `seq`-level and `token`-level accuracy, with `token` as the default.
- max_new_tokens: Generation-parameter override. The maximum number of tokens to generate when `predict_with_generate=True`, defaulting to 64.
- temperature: Generation-parameter override. The temperature when `predict_with_generate=True`, defaulting to 0.
- optimizer: Custom optimizer name for the plugin, defaults to None.
- metric: Custom metric name for the plugin. Defaults to None: 'acc' when `predict_with_generate=False` and 'nlg' when `predict_with_generate=True`.
- eval_use_evalscope: Whether to use evalscope for evaluation. This parameter must be set to enable evaluation; see the [example](../Instruction/Evaluation.md#evaluation-during-training). Default is False.
- eval_datasets: Evaluation datasets; multiple datasets can be set, separated by spaces.
- eval_datasets_args: Evaluation dataset parameters in JSON format; parameters for multiple datasets can be set.
- eval_limit: Number of samples drawn from the evaluation dataset.
- eval_generation_config: Model inference configuration during evaluation, in JSON format; default is `{'max_tokens': 512}`.

### RLHF Arguments

RLHF arguments inherit from the [training arguments](#training-arguments).

- 🔥rlhf_type: Type of human alignment algorithm, supporting `dpo`, `orpo`, `simpo`, `kto`, `cpo`, `rm`, `ppo`, and `grpo`. Default is 'dpo'.
- ref_model: Required for full-parameter training with the dpo, kto, ppo, or grpo algorithms. Default is None.
- ref_model_type: Same as model_type. Default is None.
- ref_model_revision: Same as model_revision. Default is None.
- 🔥beta: Coefficient of the KL regularization term. Default is `None`: the `simpo` algorithm defaults to `2.`, the `grpo` algorithm to `0.04`, and other algorithms to `0.1`. For details, refer to the [documentation](./RLHF.md).
- label_smoothing: Whether to use DPO smoothing, default is `0`.
- 🔥rpo_alpha: The weight of the sft_loss added to DPO, default is `1`. The final loss is `KL_loss + rpo_alpha * sft_loss`.
- cpo_alpha: Coefficient of the NLL loss in the CPO/SimPO loss, default is `1.`.
- simpo_gamma: Reward margin term in the SimPO algorithm; the paper suggests 0.5-1.5, default is `1.`.
- desirable_weight: Loss weight $\lambda_D$ for desirable responses in the KTO algorithm, default is `1.`.
- undesirable_weight: Loss weight $\lambda_U$ for undesirable responses in the KTO algorithm, default is `1.`.
- loss_scale: Overrides the template argument; default is 'last_round'.
- temperature: Default is 0.9; this parameter is used in PPO and GRPO.

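A hedged sketch of LoRA-based DPO training (the model and dataset names are placeholders):

```shell
swift rlhf \
    --rlhf_type dpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset your-preference-dataset \
    --train_type lora \
    --beta 0.1 \
    --rpo_alpha 1
```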
#### Reward Model Parameters

The reward model parameters are used in PPO and GRPO.

- reward_model: Default is None.
- reward_adapters: Default is `[]`.
- reward_model_type: Default is None.
- reward_model_revision: Default is None.

#### PPO Arguments

The meanings of the following parameters can be found [here](https://huggingface.co/docs/trl/main/ppo_trainer):

- num_ppo_epochs: Defaults to 4.
- whiten_rewards: Defaults to False.
- kl_coef: Defaults to 0.05.
- cliprange: Defaults to 0.2.
- vf_coef: Defaults to 0.1.
- cliprange_value: Defaults to 0.2.
- gamma: Defaults to 1.0.
- lam: Defaults to 0.95.
- num_mini_batches: Defaults to 1.
- local_rollout_forward_batch_size: Defaults to 64.
- num_sample_generations: Defaults to 10.
- response_length: Defaults to 512.
- missing_eos_penalty: Defaults to None.

#### GRPO Arguments
- per_device_train_batch_size: The training batch size per device. In GRPO, this refers to the batch size of completions during training.
- per_device_eval_batch_size: The evaluation batch size per device. In GRPO, this refers to the batch size of completions during evaluation.
- num_generations: The number of samples for each prompt (the G value in the paper). It needs to be divisible by `per_device_batch_size * gradient_accumulation_steps * nproc_per_node`; default is 8.
- max_completion_length: The maximum generation length in the GRPO algorithm, default is 512.
- ds3_gather_for_generation: This parameter applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. Disabling it allows training models that exceed the VRAM capacity of a single GPU, at the cost of slower generation. Disabling this option is not compatible with vLLM generation. The default is True.
- reward_funcs: Reward functions in the GRPO algorithm; options include `accuracy`, `format`, `cosine`, and `repetition`, as seen in `swift/plugin/orm.py`. You can also customize your own reward functions in the plugin. Default is `[]`.
- reward_weights: Weights for each reward function. The count must equal the total number of reward functions plus reward models. If `None`, all rewards are weighted equally with weight `1.0`.
  - Note: If `--reward_model` is included in GRPO training, it is appended to the end of the reward functions.
- reward_model_plugin: The logic of the reward model, defaulting to ORM logic. For details, refer to [Customized Reward Models](./GRPO.md#customized-reward-models).
- dataset_shuffle: Whether to shuffle the dataset randomly. Default is True.
- loss_type: The type of loss normalization. Options are ['grpo', 'bnpo', 'dr_grpo']; default is 'grpo'. For details, see this [PR](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348).
- log_completions: Whether to log the model-generated content during training, used in conjunction with `--report_to wandb`; default is False.
  - Note: If `--report_to wandb` is not set, a `completions.jsonl` is created in the checkpoint directory to store the generated content.
- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
- num_infer_workers: The number of inference workers per node. Effective only when using vLLM or lmdeploy.
- vllm_device: Configures the devices on which vLLM is deployed. Set it to auto to allocate the last num_infer_workers GPUs, or specify a number of devices equal to num_infer_workers, e.g., `--vllm_device cuda:1 cuda:2`.
- vllm_gpu_memory_utilization: vLLM pass-through parameter, default is 0.9.
- vllm_max_model_len: vLLM pass-through parameter, default is None.
- vllm_max_num_seqs: vLLM pass-through parameter, default is 256.
- vllm_enforce_eager: vLLM pass-through parameter, default is False.
- vllm_limit_mm_per_prompt: vLLM pass-through parameter, default is None.
- vllm_enable_prefix_caching: vLLM pass-through parameter, default is True.
- vllm_server_host: The host address of the vLLM server. Default is None. Used when connecting to an external vLLM server.
- vllm_server_port: The service port of the vLLM server. Default is 8000.
- vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds.
- top_k: Default is 50.
- top_p: Default is 0.9.
- repetition_penalty: Repetition penalty term. Default is 1.
- num_iterations: Number of iterations per batch. Default is 1.
- epsilon: Epsilon value for clipping. Default is 0.2.
- epsilon_high: Upper clipping coefficient, default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon.
- async_generate: Use asynchronous rollout to improve training speed, default is `False`.
- sleep_level: vLLM-specific. When the actor and rollout share the same GPU, vLLM can be put to sleep while the model is training.
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches the layers are divided into. The default is `None`, meaning the model is not split. Otherwise, the model is split into `move_model_batches + 1` (non-layer parameters) + `1` (multimodal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`.
- offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`.
  - Note: If this parameter is set to True and grad_norm remains zero during training, please install vllm==0.7.3.
- gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`.
- multi_turn_func: The multi-turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py.
|
| 449 |
+
- dynamic_sample: Exclude data within the group where the reward standard deviation is 0, and additionally sample new data. Default is False.
|
| 450 |
+
- max_resample_times: Under the dynamic_sample setting, limit the number of resampling attempts to a maximum of 3. Default is 3 times.
|
| 451 |
+
- overlong_filter: Skip overlong truncated samples, which will not be included in loss calculation. Default is False.
|
| 452 |
+
The hyperparameters for the reward function can be found in the [Built-in Reward Functions section](#built-in-reward-functions).
|
| 453 |
+
|
| 454 |
+
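The asymmetric clipping controlled by `epsilon` and `epsilon_high` acts on the importance ratio, PPO-style. The following is a minimal illustrative sketch (a hypothetical helper, not the actual ms-swift implementation):

```python
from typing import Optional


def clipped_ratio(ratio: float, epsilon: float = 0.2,
                  epsilon_high: Optional[float] = None) -> float:
    """Clip the importance ratio to [1 - epsilon, 1 + (epsilon_high or epsilon)]."""
    upper = 1.0 + (epsilon_high if epsilon_high is not None else epsilon)
    lower = 1.0 - epsilon
    return max(lower, min(upper, ratio))
```

With the defaults, a ratio of 1.5 is clipped down to 1.2; setting `epsilon_high 0.4` widens only the upper bound.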
cosine reward function arguments
- cosine_min_len_value_wrong: Reward value corresponding to the minimum length when the answer is incorrect. Default is 0.0.
- cosine_max_len_value_wrong: Reward value corresponding to the maximum length when the answer is incorrect. Default is -0.5.
- cosine_min_len_value_correct: Reward value corresponding to the minimum length when the answer is correct. Default is 1.0.
- cosine_max_len_value_correct: Reward value corresponding to the maximum length when the answer is correct. Default is 0.5.
- cosine_max_len: Maximum length limit for generated text. Default is equal to max_completion_length.

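These parameters interpolate the reward between the min-length and max-length values with a cosine schedule over the generation length. A minimal sketch of that interpolation (`cosine_reward` is a hypothetical helper; the built-in reward function may differ in detail):

```python
import math


def cosine_reward(gen_len: int, max_len: int,
                  min_len_value: float, max_len_value: float) -> float:
    """Cosine interpolation: min_len_value at length 0, max_len_value at max_len."""
    progress = gen_len / max_len
    return max_len_value + 0.5 * (min_len_value - max_len_value) * (1.0 + math.cos(progress * math.pi))
```

For a correct answer with the defaults, a very short completion gets 1.0 and a completion at `cosine_max_len` gets 0.5, so shorter correct answers are preferred.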
repetition penalty function arguments

- repetition_n_grams: Size of the n-gram used to detect repetition. Default is 3.
- repetition_max_penalty: Maximum penalty value, which controls the intensity of the penalty. Default is -1.0.

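One common way these two parameters combine is to scale the maximum penalty by the fraction of repeated n-grams in the completion. The sketch below is a hypothetical helper illustrating that idea, not the exact built-in implementation:

```python
def repetition_penalty_reward(text: str, repetition_n_grams: int = 3,
                              repetition_max_penalty: float = -1.0) -> float:
    """Scale the max penalty by the fraction of duplicated n-grams."""
    words = text.split()
    if len(words) < repetition_n_grams:
        return 0.0
    ngrams = [tuple(words[i:i + repetition_n_grams])
              for i in range(len(words) - repetition_n_grams + 1)]
    repeated_fraction = 1.0 - len(set(ngrams)) / len(ngrams)
    return repetition_max_penalty * repeated_fraction
```

A completion with no repeated 3-grams receives no penalty; fully degenerate repetition approaches the full `repetition_max_penalty`.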
Soft overlong reward parameters:

- soft_max_length: L_max in the paper, the maximum generation length of the model. Default is equal to max_completion_length.
- soft_cache_length: L_cache in the paper; controls the length-penalty interval, which is defined as [soft_max_length - soft_cache_length, soft_max_length].

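The soft overlong punishment applies no penalty up to `soft_max_length - soft_cache_length`, then decreases linearly to -1 at `soft_max_length`. A sketch of that piecewise shape (hypothetical helper, following the DAPO formulation):

```python
def soft_overlong_reward(completion_length: int, soft_max_length: int,
                         soft_cache_length: int) -> float:
    """0 below the penalty interval, linear down to -1.0 at soft_max_length."""
    boundary = soft_max_length - soft_cache_length
    if completion_length <= boundary:
        return 0.0
    if completion_length <= soft_max_length:
        return (boundary - completion_length) / soft_cache_length
    return -1.0
```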
#### SWANLAB

- **swanlab_token**: SwanLab's API key.
- **swanlab_project**: SwanLab's project, which needs to be created in advance on the page: [https://swanlab.cn/space/~](https://swanlab.cn/space/~).
- **swanlab_workspace**: Defaults to `None`; the username associated with the API key will be used.
- **swanlab_exp_name**: Experiment name; can be left empty. If empty, the value of `--output_dir` is used by default.
- **swanlab_mode**: Optional values are `cloud` and `local`, representing cloud mode or local mode.

### Inference Arguments

Inference arguments include the [base arguments](#base-arguments), [merge arguments](#merge-arguments), [vLLM arguments](#vllm-arguments), and [LMDeploy arguments](#LMDeploy-arguments), and also contain the following:

- 🔥infer_backend: Inference acceleration backend, supporting three inference engines: 'pt', 'vllm', and 'lmdeploy'. The default is 'pt'.
- 🔥max_batch_size: Effective when infer_backend is 'pt'; used for batch inference, with a default value of 1.
- ddp_backend: Effective when infer_backend is 'pt'; specifies the distributed backend for multi-GPU inference. The default is None, meaning automatic selection. For an example of multi-GPU inference, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/infer/pt).
- 🔥result_path: Path for storing inference results (jsonl). The default is None, meaning results are saved in the checkpoint directory (the one containing the args.json file) or in the './result' directory. The final storage path is printed on the command line.
- metric: Evaluates the inference results; currently supports 'acc' and 'rouge'. The default is None, meaning no evaluation is performed.
- val_dataset_sample: Number of samples from the inference dataset, default is None.

### Deployment Arguments

Deployment arguments inherit from the [inference arguments](#inference-arguments).

- host: Service host, default is '0.0.0.0'.
- port: Port number, default is 8000.
- api_key: The API key required for access; the default is None.
- owned_by: Default is `swift`.
- 🔥served_model_name: Model name for serving, defaults to the model's suffix.
- verbose: Print detailed logs, with a default value of True.
  - Note: In `swift app` or `swift eval`, the default is False.
- log_interval: Interval for printing tokens/s statistics, default is 20 seconds. If set to -1, it is not printed.
- max_logprobs: Maximum number of logprobs returned to the client, with a default value of 20.
- use_async_engine: Whether to use the async engine under the vLLM backend. Default is True.

### Web-UI Arguments

- server_name: Host for the web UI, default is '0.0.0.0'.
- server_port: Port for the web UI, default is 7860.
- share: Default is False.
- lang: Language for the web UI; options are 'zh', 'en'. Default is 'zh'.

### App Arguments

App arguments inherit from the [deployment arguments](#deployment-arguments) and [Web-UI arguments](#web-ui-arguments).

- base_url: The base URL for model deployment, for example, `http://localhost:8000/v1`. The default value is `None`, which means using local deployment.
- studio_title: Title of the studio. Default is None, in which case the model name is used.
- is_multimodal: Whether to launch the multimodal version of the app. Defaults to None, automatically determined from the model; if it cannot be determined, it is set to False.
- lang: Overrides the Web-UI argument; default is 'en'.

### Evaluation Arguments

Evaluation arguments inherit from the [deployment arguments](#deployment-arguments).

- 🔥eval_backend: Evaluation backend, defaults to 'Native'. It can also be specified as 'OpenCompass' or 'VLMEvalKit'.
- 🔥eval_dataset: Evaluation dataset, please refer to the [evaluation documentation](./Evaluation.md).
- eval_limit: Number of samples per evaluation set, defaults to None.
- eval_output_dir: Directory to store evaluation results, defaults to 'eval_output'.
- temperature: Overrides the generation parameters, defaults to 0.
- eval_num_proc: Maximum client concurrency during evaluation, defaults to 16.
- eval_url: Evaluation URL, e.g., `http://localhost:8000/v1`. Examples can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/eval/eval_url). Defaults to None, meaning local deployment is used for evaluation.
- eval_generation_config: Model inference configuration during evaluation, passed as a JSON string, e.g., `'{"max_new_tokens": 512}'`; defaults to None.
- extra_eval_args: Additional evaluation parameters, passed as a JSON string; defaults to empty. Only effective for Native evaluation. For more parameter descriptions, please refer to [here](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
- local_dataset: Some evaluation sets, such as `CMB`, require an additional data package. Setting this parameter to `true` will automatically download the full data package, create a `data` folder in the current directory, and start the evaluation. The data package is downloaded only once; later evaluations use the cache. Defaults to `false`.
  - Note: By default, evaluation uses the datasets under `~/.cache/opencompass`; after specifying this parameter, the `data` folder in the current directory is used instead.

### Export Arguments

Export arguments include the [basic arguments](#base-arguments) and [merge arguments](#merge-arguments), and also contain the following:

- 🔥output_dir: The path for storing exported results. The default value is None, and an appropriate suffix path is set automatically.
- exist_ok: If output_dir exists, do not raise an exception and overwrite the contents. The default value is False.
- 🔥quant_method: Options are 'gptq', 'awq', or 'bnb', with the default being None. Examples can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize).
- quant_n_samples: The number of calibration samples used by gptq/awq, with a default of 256.
- max_length: Max length for the calibration set; default is 2048.
- quant_batch_size: Quantization batch size, default is 1.
- group_size: Group size for quantization, default is 128.
- to_ollama: Generate the Modelfile required by Ollama. Default is False.
- 🔥to_mcore: Convert weights from HF format to Megatron format. Default is False.
- to_hf: Convert weights from Megatron format to HF format. Default is False.
- mcore_model: Path to the mcore-format model. Default is None.
- thread_count: The number of model slices when `--to_mcore true` is set. Defaults to None, in which case it is configured automatically based on the model size so that the largest slice is less than 10GB.
- 🔥test_convert_precision: Test the precision error when converting weights between HF and Megatron formats. Default is False.
- 🔥push_to_hub: Whether to push to the hub, with the default being False. Examples can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/export/push_to_hub.sh).
- hub_model_id: Model ID for pushing, default is None.
- hub_private_repo: Whether it is a private repo, default is False.
- commit_message: Commit message, default is 'update files'.

### Sampling Parameters

- prm_model: The type of process reward model. It can be a model ID (triggered using `pt`) or a `prm` key defined in a plugin (for custom inference processes).
- orm_model: The type of outcome reward model, typically a wildcard or test case, usually defined in a plugin.
- sampler_type: The type of sampling. Currently supports `sample` (using the `do_sample` method); `mcts` and `dvts` will be supported in the future.
- sampler_engine: Supports `pt`, `lmdeploy`, `vllm`, `no`. Defaults to `pt`. Specifies the inference engine for the sampling model.
- output_dir: The output directory. Defaults to `sample_output`.
- output_file: The name of the output file. Defaults to `None`, which uses a timestamp as the filename. When provided, only the filename should be passed, without the directory; only JSONL format is supported.
- override_exist_file: Whether to overwrite `output_file` if it already exists.
- num_sampling_per_gpu_batch_size: The batch size for each sampling operation.
- num_sampling_per_gpu_batches: The total number of batches to sample.
- n_best_to_keep: The number of best sequences to return.
- data_range: The partition of the dataset this sampling process handles. The format is `2 3`, meaning the dataset is divided into 3 parts and this instance is processing the 3rd partition (this implies that typically three `swift sample` processes run in parallel).
- temperature: Defaults to `1.0`.
- prm_threshold: The PRM threshold. Results below this value are filtered out. The default value is `0`.
- easy_query_threshold: For each query, if the proportion of samples the ORM evaluates as correct exceeds this value, the query is discarded, to prevent overly simple queries from appearing in the results. Defaults to `None`, meaning no filtering is applied.
- engine_kwargs: Additional parameters for the `sampler_engine`, passed as a JSON string, for example, `{"cache_max_entry_count":0.7}`.
- num_return_sequences: The number of original sequences returned by sampling. Defaults to `64`. This parameter is effective for `sample` sampling.
- cache_files: To avoid loading both the `prm` and the `generator` at the same time and causing GPU memory OOM, sampling can be done in two steps. In the first step, set `prm` and `orm` to `None`; all results are written to a file. In the second run, set `sampler_engine` to `no` and pass `--cache_files` with the output file from the first run. The cached results are then used for `prm` and `orm` evaluation to produce the final results.
  - Note: When using `cache_files`, `--dataset` still needs to be provided, because the ID for `cache_files` is computed from the MD5 of the original data; both pieces of information are used together.

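The sharding implied by `data_range` can be sketched as follows (a hypothetical helper assuming a 0-based part index, so `2 3` selects the third of three contiguous shards; the actual `swift sample` sharding may differ):

```python
import math


def select_data_range(dataset, part_index: int, total_parts: int):
    """Return the `part_index`-th (0-based) of `total_parts` contiguous shards."""
    shard_size = math.ceil(len(dataset) / total_parts)
    start = part_index * shard_size
    return dataset[start:start + shard_size]
```

Each of the parallel `swift sample` processes would then pass a different `part_index` while sharing the same `total_parts`.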
#### MCTS

- rollout_depth: The maximum depth during rollouts, default is `5`.
- rollout_start_depth: The depth at which rollouts begin; nodes below this depth only undergo expand operations, default is `3`.
- max_iterations: The maximum number of MCTS iterations, default is `100`.
- process_reward_rate: The proportion of process reward used when calculating value during selection, default is `0.0`, meaning PRM is not used.
- exploration_rate: A UCT parameter that balances exploration; a higher value gives more weight to less-explored nodes, default is `0.5`.
- api_key: Required when using a client as the inference engine, default is `EMPTY`.
- base_url: Required when using a client as the inference engine, default is 'https://dashscope.aliyuncs.com/compatible-mode/v1'.

## Specific Model Arguments

Specific model arguments can be set using `--model_kwargs` or environment variables, for example: `--model_kwargs '{"fps_max_frames": 12}'` or `FPS_MAX_FRAMES=12`.

### qwen2_vl, qvq, qwen2_5_vl

The parameter meanings are the same as in the `qwen_vl_utils` or `qwen_omni_utils` library; refer to [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24).

- IMAGE_FACTOR: Default is 28
- MIN_PIXELS: Default is `4 * 28 * 28`
- 🔥MAX_PIXELS: Default is `16384 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/ocr.sh#L3)
- MAX_RATIO: Default is 200
- VIDEO_MIN_PIXELS: Default is `128 * 28 * 28`
- 🔥VIDEO_MAX_PIXELS: Default is `768 * 28 * 28`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L7)
- VIDEO_TOTAL_PIXELS: Default is `24576 * 28 * 28`
- FRAME_FACTOR: Default is 2
- FPS: Default is 2.0
- FPS_MIN_FRAMES: Default is 4
- 🔥FPS_MAX_FRAMES: Default is 768, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/video.sh#L8)

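MIN_PIXELS/MAX_PIXELS interact with the factor-of-28 rounding performed by the vision preprocessing. The sketch below is a simplified rendition of the `smart_resize` logic in `qwen_vl_utils` (not the exact library code; consult the linked source for the authoritative version):

```python
import math


def smart_resize(height: int, width: int, factor: int = 28,
                 min_pixels: int = 4 * 28 * 28,
                 max_pixels: int = 16384 * 28 * 28):
    """Round (height, width) to multiples of `factor`, staying within the pixel budget."""
    h = max(factor, round(height / factor) * factor)
    w = max(factor, round(width / factor) * factor)
    if h * w > max_pixels:
        # Shrink so the total pixel count fits under max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h = math.floor(height / beta / factor) * factor
        w = math.floor(width / beta / factor) * factor
    elif h * w < min_pixels:
        # Enlarge so the total pixel count reaches min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * beta / factor) * factor
        w = math.ceil(width * beta / factor) * factor
    return h, w
```

Lowering `MAX_PIXELS` therefore reduces the number of visual tokens per image, which is the usual lever for cutting memory usage during multimodal training.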
### qwen2_audio

- SAMPLING_RATE: Default is 16000

### qwen2_5_omni

qwen2_5_omni not only includes the model-specific parameters of qwen2_5_vl and qwen2_audio, but also contains the following parameters:

- USE_AUDIO_IN_VIDEO: Default is False.
- 🔥ENABLE_AUDIO_OUTPUT: Default is True. If training with zero3, set it to False.

### internvl, internvl_phi3

For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5).

- MAX_NUM: Default is 12
- INPUT_SIZE: Default is 448

### internvl2, internvl2_phi3, internvl2_5, internvl3

For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B).

- MAX_NUM: Default is 12
- INPUT_SIZE: Default is 448
- VIDEO_MAX_NUM: Default is 1; this is the MAX_NUM for videos
- VIDEO_SEGMENTS: Default is 8

### minicpmv2_6, minicpmo2_6

- MAX_SLICE_NUMS: Default is 9, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)
- VIDEO_MAX_SLICE_NUMS: Default is 1; this is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
- MAX_NUM_FRAMES: Default is 64, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)

### minicpmo2_6

- INIT_TTS: Default is False
- INIT_AUDIO: Default is False

### ovis1_6, ovis2

- MAX_PARTITION: Default is 9, refer to [here](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)

### mplug_owl3, mplug_owl3_241101

- MAX_NUM_FRAMES: Default is 16, refer to [here](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)

### xcomposer2_4khd

- HD_NUM: Default is 55, refer to [here](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)

### xcomposer2_5

- HD_NUM: Default is 24 when there is a single image; with more than one image, the default is 6. Refer to [here](https://modelscope.cn/models/AI-ModelScope/internlm-xcomposer2d5-7b/file/view/master?fileName=modeling_internlm_xcomposer2.py&status=1#L254)

### video_cogvlm2

- NUM_FRAMES: Default is 24, refer to [here](https://github.com/THUDM/CogVLM2/blob/main/video_demo/inference.py#L22)

### phi3_vision

- NUM_CROPS: Default is 4, refer to [here](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)

### llama3_1_omni

- N_MELS: Default is 128, refer to [here](https://github.com/ictnlp/LLaMA-Omni/blob/544d0ff3de8817fdcbc5192941a11cf4a72cbf2b/omni_speech/infer/infer.py#L57)

### video_llava

- NUM_FRAMES: Default is 16

## Other Environment Variables

- CUDA_VISIBLE_DEVICES: Controls which GPUs are used. By default, all GPUs are used.
- ASCEND_RT_VISIBLE_DEVICES: Controls which NPUs are used (effective for Ascend cards). By default, all NPUs are used.
- MODELSCOPE_CACHE: Controls the cache path.
- NPROC_PER_NODE: Pass-through for the `--nproc_per_node` parameter in torchrun. The default is 1. If the `NPROC_PER_NODE` or `NNODES` environment variables are set, torchrun is used to start training or inference.
- MASTER_PORT: Pass-through for the `--master_port` parameter in torchrun. The default is 29500.
- MASTER_ADDR: Pass-through for the `--master_addr` parameter in torchrun.
- NNODES: Pass-through for the `--nnodes` parameter in torchrun.
- NODE_RANK: Pass-through for the `--node_rank` parameter in torchrun.
- LOG_LEVEL: The log level, default is 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
- SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the content of input_ids and generate_ids will be printed.
ms-swift/docs/source_en/Instruction/Evaluation.md
ADDED
|
@@ -0,0 +1,270 @@
# Evaluation

SWIFT supports eval (evaluation) capabilities to provide standardized evaluation metrics for both raw models and trained models.

## Capability Introduction

SWIFT's eval capability uses the EvalScope evaluation framework from the ModelScope community, with high-level encapsulation to support the evaluation needs of various models.

> Note: EvalScope supports many other complex capabilities, such as [model performance evaluation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html); for those, please use the EvalScope framework directly.

Currently, we support the evaluation process for **standard evaluation datasets** as well as **user-defined** evaluation datasets. The **standard evaluation datasets** are supported by three evaluation backends:

Below are the names of the supported datasets. For detailed information on the datasets, please refer to [all supported datasets](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).

1. Native (default):

    Primarily supports pure-text evaluation, and **supports** visualization of evaluation results.
    ```text
    'arc', 'bbh', 'ceval', 'cmmlu', 'competition_math',
    'general_qa', 'gpqa', 'gsm8k', 'hellaswag', 'humaneval',
    'ifeval', 'iquiz', 'mmlu', 'mmlu_pro',
    'race', 'trivia_qa', 'truthful_qa'
    ```

2. OpenCompass:

    Primarily supports pure-text evaluation; currently **does not support** visualization of evaluation results.
    ```text
    'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada',
    'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze',
    'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval',
    'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench',
    'ARC_e', 'COPA', 'ARC_c', 'DRCD'
    ```

3. VLMEvalKit:

    Primarily supports multimodal evaluation; currently **does not support** visualization of evaluation results.
    ```text
    'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN',
    'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11',
    'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2',
    'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL',
    'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar',
    'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL',
    'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI',
    'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST',
    'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500',
    'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL',
    'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME'
    ```

## Environment Preparation

```shell
pip install ms-swift[eval] -U
```

Or install from source:

```shell
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[eval]'
```

## Evaluation

Four evaluation methods are supported: pure-text evaluation, multimodal evaluation, URL evaluation, and custom-dataset evaluation.

**Basic Example**

```shell
CUDA_VISIBLE_DEVICES=0 \
swift eval \
    --model Qwen/Qwen2.5-0.5B-Instruct \
    --eval_backend Native \
    --infer_backend pt \
    --eval_limit 10 \
    --eval_dataset gsm8k
```

Where:
- model: Can specify a local model path or a model ID on ModelScope.
- eval_backend: Options are Native, OpenCompass, VLMEvalKit; default is Native.
- infer_backend: Options are pt, vllm, lmdeploy; default is pt.
- eval_limit: Sample size for each evaluation set. Default is None, which means using all the data; can be used for quick validation.
- eval_dataset: Evaluation dataset(s); multiple datasets can be set, separated by spaces.

For the full list of evaluation parameters, please refer to [here](./Command-line-parameters.md#evaluation-arguments).

## Evaluation During Training

SWIFT supports using EvalScope to evaluate the current model during the training process, allowing timely insight into the model's training effectiveness.

**Basic Example**

```shell
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model "Qwen/Qwen2.5-0.5B-Instruct" \
    --train_type "lora" \
    --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \
    --torch_dtype "bfloat16" \
    --num_train_epochs "1" \
    --per_device_train_batch_size "1" \
    --learning_rate "1e-4" \
    --lora_rank "8" \
    --lora_alpha "32" \
    --target_modules "all-linear" \
    --gradient_accumulation_steps "16" \
    --save_steps "50" \
    --save_total_limit "5" \
    --logging_steps "5" \
    --max_length "2048" \
    --eval_strategy "steps" \
    --eval_steps "5" \
    --per_device_eval_batch_size "5" \
    --eval_use_evalscope \
    --eval_datasets "gsm8k" \
    --eval_datasets_args '{"gsm8k": {"few_shot_num": 0}}' \
    --eval_limit "10"
```

Note that the launch command is `sft`, and the evaluation-related parameters include:
- eval_strategy: Evaluation strategy. Defaults to None, following the `save_strategy` policy.
- eval_steps: Defaults to None. If an evaluation dataset exists, it follows the `save_steps` policy.
- eval_use_evalscope: Whether to use EvalScope for evaluation; this parameter must be set to enable evaluation.
- eval_datasets: Evaluation datasets; multiple datasets can be set, separated by spaces.
- eval_datasets_args: Evaluation dataset parameters in JSON format; parameters for multiple datasets can be set.
- eval_limit: Number of samples from each evaluation dataset.
- eval_generation_config: Model inference configuration during evaluation, in JSON format; default is `{'max_tokens': 512}`.

More evaluation examples can be found in [examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval).

## Custom Evaluation Datasets

This framework supports two predefined dataset formats: multiple-choice questions (MCQ) and question answering (QA). The usage process is as follows:

*Note: When using a custom evaluation dataset, the `eval_backend` parameter must be set to `Native`.*

### Multiple-Choice Question Format (MCQ)
|
| 142 |
+
This format is suitable for scenarios involving multiple-choice questions, and the evaluation metric is accuracy.
|
| 143 |
+
|
| 144 |
+
**Data Preparation**
|
| 145 |
+
|
| 146 |
+
Prepare a CSV file in the multiple-choice question format, structured as follows:
|
| 147 |
+
|
| 148 |
+
```text
|
| 149 |
+
mcq/
|
| 150 |
+
├── example_dev.csv # (Optional) The filename should follow the format `{subset_name}_dev.csv` for few-shot evaluation
|
| 151 |
+
└── example_val.csv # The filename should follow the format `{subset_name}_val.csv` for the actual evaluation data
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
The CSV file should follow this format:
|
| 155 |
+
|
| 156 |
+
```text
|
| 157 |
+
id,question,A,B,C,D,answer
|
| 158 |
+
1,Generally speaking, the amino acids that make up animal proteins are____,4 types,22 types,20 types,19 types,C
|
| 159 |
+
2,Among the substances present in the blood, which is not a metabolic end product?____,Urea,Uric acid,Pyruvate,Carbon dioxide,C
|
| 160 |
+
```

Where:
- `id` is an optional index
- `question` is the question
- `A`, `B`, `C`, `D`, etc. are the options, with a maximum of 10 options
- `answer` is the correct option
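As a sanity check before launching an evaluation, the MCQ layout above can be validated with a short standard-library script. `validate_mcq_csv` is a hypothetical helper for illustration, not part of ms-swift or EvalScope:

```python
import csv
import string
import tempfile

def validate_mcq_csv(path):
    """Check that each row has a question, options, and a valid answer letter."""
    with open(path, newline="", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))
    for row in rows:
        # Option columns are single uppercase letters, at most 10 of them.
        options = [k for k in row if k in string.ascii_uppercase[:10] and row[k]]
        if not row.get("question"):
            raise ValueError("missing question")
        if row.get("answer") not in options:
            raise ValueError(f"answer {row.get('answer')!r} is not one of {options}")
    return len(rows)

# Quick self-check with two rows modeled on the table above
# (question text kept comma-free to stay valid unquoted CSV).
with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False,
                                 newline="", encoding="utf-8") as f:
    f.write("id,question,A,B,C,D,answer\n")
    f.write("1,Amino acids in animal proteins____,4 types,22 types,20 types,19 types,C\n")
    f.write("2,Not a metabolic end product?____,Urea,Uric acid,Pyruvate,Carbon dioxide,C\n")
    sample = f.name

print(validate_mcq_csv(sample))  # 2
```

Note that question text containing commas must be quoted per standard CSV rules, or the option columns will shift.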

**Launching Evaluation**

Run the following command:

```bash
CUDA_VISIBLE_DEVICES=0 \
swift eval \
    --model Qwen/Qwen2.5-0.5B-Instruct \
    --eval_backend Native \
    --infer_backend pt \
    --eval_dataset general_mcq \
    --dataset_args '{"general_mcq": {"local_path": "/path/to/mcq", "subset_list": ["example"]}}'
```

Where:
- `eval_dataset` should be set to `general_mcq`
- `dataset_args` should be set with:
    - `local_path`: the path to the custom dataset folder
    - `subset_list`: the names of the evaluation subsets, i.e. the `{subset_name}` from the `*_val.csv` files mentioned above

**Running Results**

```text
+---------------------+-------------+-----------------+---------+-------+---------+---------+
| Model               | Dataset     | Metric          | Subset  |   Num |   Score | Cat.0   |
+=====================+=============+=================+=========+=======+=========+=========+
| Qwen2-0.5B-Instruct | general_mcq | AverageAccuracy | example |    12 |  0.5833 | default |
+---------------------+-------------+-----------------+---------+-------+---------+---------+
```

### Question-and-Answer Format (QA)

This format is suitable for question-and-answer scenarios; the evaluation metrics are `ROUGE` and `BLEU`.

**Data Preparation**

Prepare a JSON Lines file in the question-and-answer format, organized as follows:

```text
qa/
└── example.jsonl
```

The JSON Lines file should follow this format:

```json
{"query": "What is the capital of China?", "response": "The capital of China is Beijing"}
{"query": "What is the highest mountain in the world?", "response": "It is Mount Everest"}
{"query": "Why can't penguins be seen in the Arctic?", "response": "Because most penguins live in Antarctica"}
```
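The QA file above can be generated and checked with the standard library alone. This is an illustrative sketch, not part of ms-swift:

```python
import json
import tempfile

# Sample pairs mirroring the format above.
pairs = [
    ("What is the capital of China?", "The capital of China is Beijing"),
    ("What is the highest mountain in the world?", "It is Mount Everest"),
]

with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False,
                                 encoding="utf-8") as f:
    for query, response in pairs:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        f.write(json.dumps({"query": query, "response": response},
                           ensure_ascii=False) + "\n")
    path = f.name

# Each line must parse independently as a JSON object with both keys.
with open(path, encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(len(records))  # 2
```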

**Launching Evaluation**

Run the following command:

```bash
CUDA_VISIBLE_DEVICES=0 \
swift eval \
    --model Qwen/Qwen2.5-0.5B-Instruct \
    --eval_backend Native \
    --infer_backend pt \
    --eval_dataset general_qa \
    --dataset_args '{"general_qa": {"local_path": "/path/to/qa", "subset_list": ["example"]}}'
```

Where:
- `eval_dataset` should be set to `general_qa`
- `dataset_args` is a JSON string that needs to be set with:
    - `local_path`: the path to the custom dataset folder
    - `subset_list`: the names of the evaluation files, i.e. the `{subset_name}` from the `*.jsonl` files mentioned above

**Running Results**

```text
+---------------------+------------+-----------+---------+-------+---------+---------+
| Model               | Dataset    | Metric    | Subset  |   Num |   Score | Cat.0   |
+=====================+============+===========+=========+=======+=========+=========+
| Qwen2-0.5B-Instruct | general_qa | bleu-1    | default |    12 |  0.2324 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-2    | default |    12 |  0.1451 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-3    | default |    12 |  0.0625 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | bleu-4    | default |    12 |  0.0556 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-f | default |    12 |  0.3441 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-p | default |    12 |  0.2393 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-1-r | default |    12 |  0.8889 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-f | default |    12 |  0.2062 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-p | default |    12 |  0.1453 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-2-r | default |    12 |  0.6167 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-f | default |    12 |  0.333  | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-p | default |    12 |  0.2324 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
| Qwen2-0.5B-Instruct | general_qa | rouge-l-r | default |    12 |  0.8889 | default |
+---------------------+------------+-----------+---------+-------+---------+---------+
```

ms-swift/docs/source_en/Instruction/Frequently-asked-questions.md
ADDED
@@ -0,0 +1,716 @@
# Frequently-asked-questions

Here are some common questions encountered during the use of Swift.

## Training

### Q1: What models and datasets are supported for fine-tuning in Swift?
Please refer to the documentation on [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html).

### Q2: What data formats are supported when training with custom datasets?
For custom dataset formats, see the documentation on [Custom Dataset](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html).

### Q3: What is the format of dataset_info.json for custom datasets, and how can I use it?
The dataset_info.json format can be found in the documentation on [Custom Dataset](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html). On the command line, use `--custom_dataset_info xxx.json` together with `--dataset <dataset_id_or_path>`.

### Q4: How can I train with a custom dataset using the interface?
Using a custom dataset through the interface is the same as using the command line. Refer to the documentation on [Custom Dataset](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html).

### Q5: Can I write a line in the jsonl file like this? {"index": "00000", "query": "11111", "response": "22222", 'source':'qqq'}
Extra fields are allowed; they simply won't be used.

### Q6: Where can I find the command line parameters?
Please refer to the documentation on [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q7: What parameters need to be configured for training in an offline environment?
Use `--model local_path` and `--check_model false`. For more details, see the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q8: Where can I check model_type?
Check the [Supported Models and Datasets](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html).

### Q9: Can I directly convert the model to gguf format after training?
Currently, only export to ModelFile is supported. See the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q10: Does Swift support pre-training? I only see SFT.
Yes, it does. Use the command `swift pt` ([pt example](https://github.com/modelscope/ms-swift/tree/main/examples/train/pretrain)). The dataset format is detailed in [Custom Dataset](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html).

### Q11: For models fine-tuned with LoRA, should I merge them into one model for resuming training, or can I specify the original model and LoRA block by path directly?
You do not need to merge. Use `--resume_from_checkpoint output/xxx/vx-xxx/checkpoint-xxx`. See the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q12: I would like to control the location where the original model weights downloaded from the internet are stored. How can I place the original model in a specific folder?
You can set the environment variable `MODELSCOPE_CACHE=your_path` to store the original model in the specified path. For SDK downloads, use `cache_dir="local_path"`. You can also use the `modelscope download` command-line tool or `git` to download it. For details, refer to [Download Model](https://modelscope.cn/docs/Models/Download-Model). During training, set `--model` to the local path. For offline training, configure `--check_model false`. See the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).
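Putting Q7 and Q12 together, an offline workflow might look like the following sketch. The model name, cache path, and dataset path are illustrative only, and it assumes the `modelscope` CLI is installed:

```shell
# Run once on a machine with network access: cache the weights locally.
export MODELSCOPE_CACHE=/data/models
modelscope download --model Qwen/Qwen2.5-7B-Instruct

# Later, on the offline machine: point --model at the local directory
# and disable the online model check.
swift sft \
    --model /data/models/Qwen/Qwen2.5-7B-Instruct \
    --check_model false \
    --dataset /data/datasets/train.jsonl
```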

### Q13: Has anyone encountered this issue with ms-swift?
```text
[rank6]: pydantic_core._pydantic_core.ValidationError: 1 validation error for DeepSpeedZeroConfig
[rank6]: stage3_prefetch_bucket_size
[rank6]: Input should be a valid integer, got a number with a fractional part [type=int_from_float, input_value=11560550.4, input_type=float]
[rank6]: For further information visit https://errors.pydantic.dev/2.8/v/int_from_float
```
Downgrade `deepspeed` to `0.14.*`.

### Q14: Is there a complete tutorial and command line for fine-tuning Qwen-2-VL?
Refer to the [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) for multimodal model training.

### Q15: Are there any tricks supported for fine-tuning multimodal large models, similar to the LLM's neftune?
You can try `lora` variants such as `pissa/olora/dora`, or `fourierft`. Refer to the tricks in the `sft` parameters; some may not apply to multimodal models.

### Q16: The accuracy from eval during training and the accuracy computed from re-inference with the saved checkpoint are not consistent.
The methods for calculating eval accuracy during training and inference are different. The default `acc_strategy` is `token`; the selectable values are `token` and `sentence`.

### Q17: Official ModelScope image and Swift environment.
You can start a container using `docker run`, for example: `docker run --gpus all -p 8000:8000 -it -d --name ms registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.1.0-py310-torch2.3.0-tf2.16.1-1.16.0 /bin/bash`. After starting the container, pull the latest code to install Swift.

### Q18: Command line for multi-machine multi-card training.
For details, see the [multi-node example](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node).

### Q19: How to choose a template?
See this [issue](https://github.com/modelscope/ms-swift/issues/1813).

### Q20: How to use torchrun and swift sft for multi-card training?
`swift sft` uses `torchrun`.

### Q21: My SFT dataset is too large and tokenizing takes a long time. Is there a solution?
Use `lazy_tokenize`. See the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q22: When two datasets are simply appended together in the training set, does the model shuffle internally during training, or does it take data in order?
The trainer shuffles randomly.

### Q23: If the model is on two cards and the data is not parallelized, deepspeed throws an error. How to handle this?
`deepspeed` and `device_map` are incompatible; you can only choose one.

### Q24: Why does the dataset need to be downloaded again when retraining offline, despite having already been downloaded online?
The data file contains URLs, which do not support offline training.

### Q25: How to reduce GPU memory usage when training VLM models?
Set `--freeze_vit true`.

### Q26: Why are there fewer models supported in the WEB-UI than in the documentation?
Upgrade `ms-swift`.

### Q27: For models that do not have a suitable model_type, can I customize special_tokens and chat_template during SFT?
Yes, you can. Refer to the PR for model integration and the custom model dataset documentation.

### Q28: Can I use DPO to train Qwen2-VL in a Python script?
Yes. Import `rlhf_main` and `RLHFArguments` from `swift.llm`.

### Q29: Can I pre-train with pure text before fine-tuning on a VQA dataset for MLLM?
Yes, and you can also mix the two during training.

### Q30: When conducting DPO training based on the qwen2 SFT model on a V100 machine, training shows NaN?
Use fp32 for training on V100 machines.

### Q31: Does Swift support distillation?
Refer to this [example](https://github.com/modelscope/ms-swift/blob/main/examples/sampler/distill/distill.sh).

### Q32: Encountered the error `cannot import name 'ftp_head' from 'datasets.utils.file_utils'`. Has anyone faced this issue?
Try `pip install datasets==2.*`.

### Q33: The default maximum number of checkpoints saved after training is two. How can I increase this number?
Use `--save_total_limit`. See the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q34: In grounding tasks, does the universal data format support multiple instances of one category?
Currently, one object may correspond to multiple bounding boxes. Refer to the documentation on [Custom Dataset](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html).

### Q35: Why am I getting the error that numpy.object cannot be found?
Try `numpy==1.26.3`.

### Q36: Does the Swift framework support sequence parallelism now?
Yes. It is implemented using `xtuner`.

### Q37: When fine-tuning qwen2-1.5B on a V100, I see `loss': 0.0, 'acc': 0.0, 'grad_norm': nan`. What is the issue?
Try using fp32.

### Q38: Is it possible to fully fine-tune GPTQ quantized models?
No. A GPTQ model's int-type parameters cannot participate in gradient computation; they can only be updated through additional structures such as LoRA.

### Q39: What parameters should I set for fine-tuning glm4-chat with QLoRA?
Refer to the QLoRA [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora).

### Q40: I encounter the error `'AdamW' object has no attribute 'train'` when training my dataset on qwen2-vl-7b.
Try `accelerate==0.34.0`.

### Q41: How do I expand my vocabulary within the Swift framework?
Swift currently does not support vocabulary expansion.

### Q42: Can I directly use models with the same name from Hugging Face?
Set the environment variable `USE_HF=1`.

### Q43: Can Qwen2-VL-2B do incremental pre-training? Is there guidance available?
Yes, it supports incremental pre-training. Just put all the content in the response field.

### Q44: When training with videos, how can I control the frame sampling rate? The `frame_rate` setting doesn't seem to work, and I'm using MiniCPM-V.
Set the environment variable `MAX_NUM_FRAMES`.

### Q45: Can I save the inference results of the validation set during training in Swift?
After training, run `swift infer` to save the results.

### Q46: Why is the saved checkpoint larger than the original model file after full-parameter DPO?
Fine-tuning on V100 stores the weights in fp32 format.

### Q47: Training speed slows down in multi-machine training; using the Swift framework for LLM training with deepspeed zero3 causes a significant performance drop.
See this [issue](https://github.com/modelscope/ms-swift/issues/1825).

### Q48: Does Swift support multi-stage pre-training for qwen2-vl? The official best practices only show SFT with vit+llm trained together; is separate fine-tuning supported?
Refer to this [issue](https://github.com/modelscope/ms-swift/issues/2222).

### Q49: Does qwen2-vl support mixing pure text data?
It supports both mixed visual-text and pure text data.

### Q50: Can I plot loss curves for different datasets during fine-tuning?
This is not supported; datasets are trained in a mixed manner.

### Q51: After model training, the responses contain a lot of repeated content.
Refer to [Pre-training and Fine-tuning](https://swift.readthedocs.io/en/latest/Instruction/Pre-training-and-Fine-tuning.html). If you notice repetition during training, try training for more epochs, cleaning the data, conducting full-parameter training, or using RLHF to mitigate the issue.

### Q52: Does Swift currently support prompt tuning or prefix tuning?
No. Both methods suffer from serious forgetting issues and are not currently recommended.

### Q53: I encountered the following error when training with two A10s:
```text
[rank0]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1970, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.20.5
[rank0]: ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
```
Please check whether shared memory is too small; NCCL requires shared memory.

### Q54: How to solve the issue of certain parameters not participating in backpropagation when freezing layers during DDP fine-tuning?
Set the parameter `--ddp_find_unused_parameters true`.

### Q55: Does Swift have a dataset quality inspection tool?
[data-juicer](https://github.com/modelscope/data-juicer).

### Q56: Where do I enable model parallelism on the web UI? I only found the option for data parallelism.
Specify the visible GPUs to enable model parallelism.

### Q57: How can I turn off automatic shuffling?
Currently, you can only modify the [transformers code](https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py).

### Q58: What is the parameter `num_items_in_batch`? I can't find it.
Upgrade to `ms-swift==2.5.2` or downgrade to `transformers<4.46`.

### Q59: How can I set a fixed location for dataset downloads when using --dataset, and how can I read from that location next time? I can't find this in the command line parameters.
`dataset_path` supports folders, typically for datasets downloaded via `git clone`. See the [Custom Dataset documentation](https://swift.readthedocs.io/en/latest/Customization/Custom-dataset.html#dataset-info-json).

### Q60: When using --streaming true, I get an error asking me to set max_steps when setting num_train_epochs. Can't I just set num_train_epochs?
Streaming dataset loading requires setting `max_steps`.

### Q61: Why does tools use the "[]" format rather than [] directly?
Because the underlying pyarrow in datasets enforces strict type control. For the same reason, the objects part of our official grounding dataset also uses str; otherwise pyarrow would report errors about inconsistent types across rows.
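A minimal illustration of this convention, using only the standard library: the list-valued `tools` field is serialized to a JSON string before being written into a dataset row, so every row has the same column type.

```python
import json

# A hypothetical tool definition; the exact schema is illustrative.
tools = [
    {"name": "get_weather", "description": "Query the weather for a city"}
]

row = {
    "query": "What's the weather in Beijing?",
    "tools": json.dumps(tools, ensure_ascii=False),  # stored as str, not list
}

# Every row's `tools` column is now a str, which pyarrow accepts;
# consumers recover the structure with json.loads.
recovered = json.loads(row["tools"])
print(recovered[0]["name"])  # get_weather
```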
|
| 199 |
+
|
| 200 |
+
### Q62: Can't this parameter be used? check_dataset_strategy==discard
|
| 201 |
+
This parameter no longer exists in swift3.0, use the `strict` parameter instead.
|
| 202 |
+
|
| 203 |
+
### Q63: Getting this error when running sft command:
|
| 204 |
+
```text
|
| 205 |
+
RuntimeError: Expected to mark a variable ready only once.This error is caused by one of the following reasons: 1) Use of a module parameter outsid forward function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph( ) as round if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple oint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph( ) as a workaround if dule graph does not change over iterations.
|
| 206 |
+
```
|
| 207 |
+
Add this parameter: `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`.
|
| 208 |
+
|
| 209 |
+
### Q64: Have you encountered this issue? AttributeError:'TrainerState' object has no attribute 'last_model_checkpoint'
|
| 210 |
+
Dataset is too small, need to add more data. Error occurs when data quantity is less than one step.
|
| 211 |
+
|
| 212 |
+
### Q65: I see preprocess can be defined in CustomPreprocessor. Is this processed all at once before training starts, or loaded during training?
|
| 213 |
+
If `--streaming true` is set, it loads while training. By default, it processes everything before training.
|
| 214 |
+
|
| 215 |
+
### Q66: For full-parameter training of internvl2_5, why do vision_model and mlp1 appear in freeze parameters by default? Documentation shows freeze_parameters defaults to [], and command line settings for freeze vit, freeze aligner, freeze llm are all False. It prints trainable parameters: ['mlp1'] - unclear if only mlp1 is trainable or all parameters
|
| 216 |
+
First freeze parameters then active parameters. The three parameters `freeze vit/freeze aligner/freeze llm` adjust freeze parameters and trainable parameters. Since some models' `vit` contains `aligner`, aligner is separately added to trainable_parameters.
|
| 217 |
+
|
| 218 |
+
### Q67: Does LlamaPro in swift support multimodal adaptation?
|
| 219 |
+
Yes, it's supported.
|
| 220 |
+
|
| 221 |
+
### Q68: I noticed 2.x supports MAX_PIXELS. Is the --max_pixel parameter in 3.x documentation the same thing? What's the processing logic? Using 12000*9000 images with internvl still crashes in 2.x even with resacle_image
|
| 222 |
+
Environment variable parameters correspond to model parameters. `MAX_PIXELS` only supports qwen2vl, internvl has its own environment variables. See [Specific Model Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html#specific-model-argumen).
|
| 223 |
+
|
| 224 |
+
### Q69: Is there documentation for fine-tuning qwen base model to chat model? Any special configurations needed?
|
| 225 |
+
Use `swift sft`, no special configuration needed. See [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/base_to_chat).
|
| 226 |
+
|
| 227 |
+
### Q70: Where can I find sequence parallel examples?
|
| 228 |
+
See this example: [sequence_parallel](https://github.com/modelscope/ms-swift/tree/main/examples/train/sequence_parallel).
|
| 229 |
+
|
| 230 |
+
### Q71: Can swift support training custom model structures?
|
| 231 |
+
Yes, just customize the `get_model_tokenizer_xxx` function to return `model` and `tokenizer`.
|
| 232 |
+
|
| 233 |
+
### Q72: Getting an error using longlora with "name_or_path": "/mnt/workspace/model/Qwen2.5-14B-Instruct". Is longlora only for llama series?
|
| 234 |
+
Yes, `longlora` only works with llama series.
|
| 235 |
+
|
| 236 |
+
### Q73: How to add custom special tokens in swift?
|
| 237 |
+
Add them in the `get_model_tokenizer` function.
|
| 238 |
+
|
| 239 |
+
### Q74: For --freeze_parameters_ratio parameter, if set to 0.7, does it mean only 30% of llm parameters are updated during training? Is it random 30%? What's the update mechanism?
|
| 240 |
+
Freezes from bottom to top.
|
| 241 |
+
|
| 242 |
+
### Q75: Why is the map process so slow? Is this normal?
```text
Map: 4%|██ | 9000/203823 [02:18<50:34, 64.19 examples/s]
```
Use the `--dataset_num_proc` parameter to enable multiple processes.

### Q76: How can I delete and redownload a dataset? I think there might be an issue with the dataset.
Set the `--download_mode` parameter.

### Q77: How to solve this error: safetensors_rust.SafetensorError: Error while deserializing header: HeaderTooLarge?
The disk space was insufficient, so the model was not saved completely.

### Q78: Does swift3.0 not support get_default_template_type?
Please check `model.model_meta.template`; the information is available in `model.model_meta` and `model.model_info`.

### Q79: Does ModelScope Swift support hermes format agent fine-tuning? I see qwen2.5 uses vllm with native support for hermes format tool calling, why don't I see it in Swift?
Currently, the `hermes` format is not supported. We mainly support the `toolbench` and `react` formats, as `react` is more widely used. Swift deployment currently supports parsing these two formats and provides `openai tool calling`.

### Q80: Does model training use left padding by default?
Training can use either left or right padding. The default is right padding, while `batch infer` uses left padding.

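The two padding sides can be illustrated with a toy sketch (plain Python; the pad token id 0 is an assumption):

```python
# Training pads on the right; batch inference pads on the left so that
# generation starts immediately after the real tokens.

def pad_batch(batch, pad_id=0, side="right"):
    """Pad a batch of token-id lists to equal length on the given side."""
    max_len = max(len(seq) for seq in batch)
    padded = []
    for seq in batch:
        pad = [pad_id] * (max_len - len(seq))
        padded.append(seq + pad if side == "right" else pad + seq)
    return padded

batch = [[5, 6, 7], [8, 9]]
print(pad_batch(batch, side="right"))  # [[5, 6, 7], [8, 9, 0]]  (training)
print(pad_batch(batch, side="left"))   # [[5, 6, 7], [0, 8, 9]]  (batch infer)
```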
### Q81: Does it support grounding tasks now?
Yes, there's an [example](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/grounding.sh) under examples.

### Q82: Does ms-swift support contrastive learning for training llm_emb?
Yes, here's an [example](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train.sh).

### Q83: Is there a big difference in performance between manually coded fine-tuning and GRPO using the peft and trl libraries compared to official Swift training with the same parameters?
The difference is minimal, and Swift additionally supports multimodality.

### Q84: Does Swift currently not support audio modal input training for minicpmo2_6? It shows error: assert media_type in {'image', 'video'}
Audio is not currently supported.

### Q85: Can Swift fine-tune deepseek R1 671B?
Yes, the template is integrated, but the process is complicated as it requires converting fp8 to bf16 first.

### Q86: Isn't the latest Swift framework supposed to specify the model location using this command? This is the location of the model I've already downloaded, but I don't know why it still tries to download and fails with a git clone error
```shell
--model /mnt/workspace/.cache/modelscope/hub/deepseek-ai/deepseek-vl2/ \
```
Some models require cloning the repo and then specifying it through `local_repo_path`.

### Q87: Does Swift now support multimodal GRPO?
Yes, it does.

### Q88: Can the GRPO reward function be customized?
Yes, refer to [examples/train/grpo/plugin](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin).

### Q89: Why do I get the following error when using --torch_dtype float16 (the card cannot use bf16): ValueError: Attempting to unscale FP16 gradients. (raised in lib/python3.12/site-packages/torch/amp/grad_scaler.py, line 260, in unscale_grads)
FP16 does not support full-parameter training.

### Q90: I have a question. I trained a reward model using Swift (the base is qwen2.5-7b), but loading it in PPO or GRPO shows an error. The reward model was trained using LoRA.
```shell
--rlhf_type ppo \
--model Qwen/Qwen2.5-14B-Instruct \
--reward_model /mnt/workspace/output/rm/model --train_type lora \
--dataset 'AI-ModelScope/alpaca-gpt4-data-zh#20000' --torch_dtype float32 --num_train_epochs 1 \
--per_device_train_batch_size 1 --per_device_eval_batch_size 1 --learning_rate 1e-5 --lora_rank 8 --lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 16 --eval_steps 100 --save_steps 100 \
```
The LoRA-trained reward model needs to be merged first.

### Q91: What version of transformers is needed to fine-tune deepseek_vl2? Official docs say <4.42, but it also shows errors with 4.42 and below. Does the peft version need to be lowered too?
Use `peft==0.11.*`.

### Q92: Generating the train split is too slow (about 30+ datasets with around a million data points in total). Swift 2.x wasn't this slow. Lazy tokenize is already enabled.
Set `--dataset_num_proc 16`.

### Q93: How can I full-parameter fine-tune the visual encoder while using LoRA to fine-tune the LLM when fine-tuning qwen2.5vl?
Refer to this [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/lora_llm_full_vit).

### Q94: How to use custom loss functions in Swift?
Add them in the plugin.

### Q95: What are the parameters for MoE? I can't find keywords in the parameter table. How do I set the number of experts and expert routing parameters?
Use the parameters from `config.json` directly.

### Q96: Using lmdeploy in GRPO training reports missing functions. The `load_weights` function isn't found in the `LMDeployEngine` class.
It is only supported under the turbomind engine.

### Q97: Getting errors when fine-tuning the Moonlight-16B-A3B-Instruct model. Does ms-swift not support fine-tuning this model?
Training is disabled in the model files. Refer to the deepseek_vl2 solution in the issues.

### Q98: How to solve this error: RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'?
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift sft \
--model Internlm3-8b \
--dataset train.json \
--train_type full \
--torch_dtype bfloat16 \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--deepspeed zero3 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--gradient_accumulation_steps 16 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 5 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4
```
Upgrade torch; the current version may not implement this operator for BFloat16.

### Q99: Does it support custom rewards?
Yes, please check this [example](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin).

### Q100: Is it normal that both loss and grad_norm are 0 during GRPO training?
```text
{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 9e-08, 'memory(GiB)': 88.1, 'train_speed(iter/s)': 0.009252, 'completion_length': 150.00000763, 'response_clip_ratio': 0.0, 'rewards/Format': 1.0, 'reward': 1.0, 'reward_std': 0.0, 'kl': 0.0, 'clip_ratio': 0.0, 'epoch': 0.0, 'global_step/max_steps': '1/1052', 'percentage': '0.10%', 'elapsed_time': '36s', 'remaining_time': '10h 43m 54s'}
{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 1.8e-07, 'memory(GiB)': 94.15, 'train_speed(iter/s)': 0.014782, 'completion_length': 133.25000763, 'response_clip_ratio': 0.0, 'rewards/Format': 1.0, 'reward': 1.0, 'reward_std': 0.0, 'kl': 0.0, 'clip_ratio': 0.0, 'epoch': 0.0, 'global_step/max_steps': '2/1052', 'percentage': '0.19%', 'elapsed_time': '1m 3s', 'remaining_time': '9h 19m 49s'}
{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 2.7e-07, 'memory(GiB)': 94.15, 'train_speed(iter/s)': 0.018695, 'completion_length': 123.08333969, 'response_clip_ratio': 0.0, 'rewards/Format': 1.0, 'reward': 1.0, 'reward_std': 0.0, 'kl': 0.0, 'clip_ratio': 0.0, 'epoch': 0.0, 'global_step/max_steps': '3/1052', 'percentage': '0.29%', 'elapsed_time': '1m 29s', 'remaining_time': '8h 39m 34s'}
```
A loss close to 0 during GRPO training is normal; refer to this [issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851).

### Q101: Where can I pass in accuracy_orm for GRPO's built-in reward function?
Currently it requires modifying the code directly.

### Q102: I notice the reward function has a solution parameter. Does it need to be passed from the dataset? Must my dataset have a solution field?
Yes, it's necessary for math problems to calculate accuracy.

### Q103: Why is there no token_acc during training?
Some models have mismatched `logits` and `labels` counts, so token accuracy isn't calculated.

### Q104: When fine-tuning Ovis2, LoRA parameters don't seem to work? Memory usage doesn't change with or without --train_type lora.
Limit `--max_length`; this model is special and needs padding to max_length.

### Q105: Getting a ValueError when running a classification task with Qwen2.5: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
Dataset format: {"messages": [{"role": "user", "content": "xxxxx"}, {"label": 1}]}
Put `label` at the same level as `messages`, not inside it.

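For reference, a corrected record can be built like this (plain Python; the content string is a placeholder):

```python
import json

# The corrected record: `label` sits at the same level as `messages`,
# not inside the message list.
record = {
    "messages": [{"role": "user", "content": "xxxxx"}],
    "label": 1,
}
line = json.dumps(record, ensure_ascii=False)
print(line)  # one line of the JSONL dataset file
```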
### Q106: Does anyone know what's wrong with this? The training method is VERA
```text
KeyError("The 'metric_for_best_model' training argument is set to 'eval_loss', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'global_step/max_steps', 'percentage', 'elapsed_time', 'remaining_time']. Consider changing the 'metric_for_best_model' via the TrainingArguments.")
Train: 45%| 100/220 [09:47<11:44, 5.87s/it]
```
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
--dataset '/mnt/workspace/data.json' \
--train_type vera \
--torch_dtype bfloat16 \
--num_train_epochs 4 \
--per_device_train_batch_size 1 \
--learning_rate 1e-4 \
--gradient_accumulation_steps 16 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 5 \
--logging_steps 5 \
--max_length 4096 \
--warmup_ratio 0.05 \
--output_dir output/Qwen2.5-7B-vera \
```
Add the `--label_names labels` parameter.

### Q107: How to exit VllmEngine? I want to release GPU memory after inference rather than keeping it occupied.
Use sleep mode: call `engine.sleep(level=1)` / `engine.wake_up()`, with `enable_sleep_mode=True` set during initialization.

### Q108: Does trainer_sampler_random have no effect in streaming mode?
Streaming is not random.

### Q109: Can trust_remote_code be set when using vllm for GRPO training?
It's true by default.

### Q110: For large-dataset pretraining using streaming and packing, is there a way to calculate the total steps from epochs, batch size, etc. when setting max_steps?
Training ends at whichever is reached first between `epochs` and `max_steps`.

### Q111: Unsloth training error: "assert(type(target_modules) in (list,tuple,))" when using --target_modules all-linear
Don't use `all-linear`; specify a concrete module list, e.g. `--target_modules q k v`.

### Q112: Does Swift support multi-label classification now?
Yes. Check the custom dataset docs for the format and search for `problem_type` in the command line parameter docs.

### Q113: How does flash_attn handle packing - separately or merged?
Packed samples are attended to separately. Flash attention is required to avoid errors; otherwise the attention_mask will have issues.

## Inference

### Q1: Is there documentation for Swift inference?
Swift supports inference via Python scripts, the command line, and a UI interface. See [Inference and Deployment](https://swift.readthedocs.io/en/latest/Instruction/Inference-and-deployment.html).

### Q2: How to use the trained model for inference with a dataset?
Use the parameters `--load_data_args true` or `--val_dataset <your-val-dataset>`. Refer to the [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q3: Can I specify a locally saved model during Swift inference?
Set `--model` to the local path. See [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q4: How do I infer on a dataset without labels? I see that the dataset format in the documentation is all for the training set.
Configure the parameter `--val_dataset <your-val-dataset>`.

### Q5: How to resolve the error `ValueError: Input length of input_ids is 35, but max_length is set to 20`?
```text
raise ValueError(
ValueError: Input length of input_ids is 35, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
```
Set `model.generation_config.max_new_tokens`.

### Q6: qwen2-vl inference (training) runs out of memory
Set the command line parameter `--max_pixels xxx`, the environment variable `MAX_PIXELS=xxx`, or the specific model parameter `--model_kwargs '{"max_pixels": xxx}'`. Note that the environment variable only takes effect for the corresponding models in the documentation. For more details, please refer to [Specific Model Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html#specific-model-arguments).

### Q7: On a V100 GPU, in a Python virtual environment, following the environment setup instructions from https://swift2x.readthedocs.io/zh-cn/latest/Multi-Modal/qwen2-vl%E6%9C%80%E4%BD%B3%E5%AE%9E%E8%B7%B5.html, when testing the inference command: `CUDA_VISIBLE_DEVICES=0,1,2,3 swift infer --model_type qwen2-vl-7b-instruct`, an error occurs: `RuntimeError: probability tensor contains either 'inf', 'nan' or element < 0`.
Try inference on A10 or 3090 machines.

### Q8: After running the prediction command, where are the results saved? CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir output/glm4v-9b-chat/vx-xxx/checkpoint-xxx-merged --load_data_args true
Results are printed in the log.

### Q9: For the latest version of swift, can the infer command output probability values through the logprobs parameter?
Yes, logprobs can be output. For command line inference, set `--logprobs true`. For Python script inference, set `request_config = RequestConfig(..., logprobs=True, top_logprobs=2)`. Please refer to [test_logprobs.py](https://github.com/modelscope/ms-swift/blob/main/tests/infer/test_logprobs.py).

### Q10: In the latest version of Swift, while loading the qwen2-32b-instruct-awq quantized model, I was advised to add merge-lora true. After doing this, it throws an error. When I omit it, inference works but slowly.
Models trained with QLoRA do not support merge-lora; it is recommended to merge-lora after fine-tuning and then quantize.

### Q11: Getting the error `assert factor in rope_scaling` with vllm?
For more details, see qwen2-vl [issue#96](https://github.com/QwenLM/Qwen2-VL/issues/96).

### Q12: Does vllm require the models to be merged before calling them during inference?
Models do not have to be merged. See the documentation on [Command Line Parameters](https://swift.readthedocs.io/en/latest/Instruction/Command-line-parameters.html).

### Q13: How to use the CPU when performing inference with Python scripts?
Set the environment variable: `os.environ['CUDA_VISIBLE_DEVICES'] = '-1'`.

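A minimal sketch: the variable must be set before torch (or swift) is imported, otherwise CUDA may already be initialized.

```python
import os

# Hide all GPUs so inference falls back to CPU; this must happen before
# importing any CUDA-aware library such as torch or swift.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# import torch / swift only after the variable is set
```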
### Q14: Has anyone encountered the error `RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'`?
Upgrade Torch, as the current version may not have implemented this operator.

### Q15: Does qwen2-audio support streaming inference?
Yes, see the [issue](https://github.com/modelscope/ms-swift/issues/1653).

### Q16: Where to set `do_sample` for multi-modal inference using the inference client?
Set `temperature=0`.

### Q17: Does ms-swift support batch processing for large models?
Supported. See the [demo](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py).

### Q18: When quantizing models with ms-swift, it reports insufficient memory. Can we reduce resource usage during quantization, even if it's slower?
Try setting `--device_map cpu`.

### Q19: Does Swift support quantization for multi-modal models?
Yes, it supports quantization.

### Q20: Encountering the following error while using GPTQ, what is the cause?
```text
if llm_config['architectures'][0] == 'LlamaForCausalLM':
KeyError: 'architectures'
```
Try using `transformers==4.44.*`.

### Q21: How can I specify where to save evaluation results during swift infer? I can't find where the results are saved.
Set `--result_path your_path`. See [InferArguments](https://github.com/modelscope/ms-swift/blob/main/swift/llm/argument/infer_args.py).

### Q22: I get an error while using the AWQ quantized yi-vl-6b:
```text
TypeError: swift.llm.utils.model.get_model_tokenizer_with_flash_attn() got multiple values for keyword argument 'automodel_class'.
```
Please use GPTQ quantization.

### Q23: I would like to ask about using swift export to perform GPTQ INT4 quantization on the qwen2.5 72B model, with the max model length of 32768 as the default value. The calibration dataset provided has 128 samples, but an error occurred during quantization. The error log is: "factorization could not be completed because the input is not positive-definite (the leading minor of order 18145 is not positive-definite)." What is the cause?
This indicates that the Hessian matrix is not positive definite. Try using a different dataset.

### Q24: Can batch inference only be done through custom code? Can't it be done like SFT with script parameters?
Yes, it can: use `swift infer --val_dataset xxx --max_batch_size 16 ...`.

### Q25: What's the default temperature value when using swift app for inference?
It's read from `generation_config.json` by default.

### Q26: Can export and quantization be done using multiple GPUs?
Model loading can use multiple GPUs, but quantization is single-GPU only.

### Q27: When using swift export with a custom template_type, does it permanently change the template_type? If we use swift export --template_type custom, does it change the model's template?
No, it won't be modified. Templates in swift are defined internally, not saved in jinja format.

### Q28: AWQ quantization for Qwen2VL gives error: TypeError: Qwen2VLForConditionalGeneration.init() got an unexpected keyword argument 'use_cache'
Use `gptq` quantization instead.

### Q29: For DDP inference, does max_batch_size in infer refer to the batch size per GPU or the total batch size?
It refers to the batch size per GPU.

### Q30: Does swift.inference now support messages format input? It seems to only support query format currently. The answer contains part of the prompt; how should I modify the inference so that the model completes the answer?
```text
{"messages": [{"role": "system", "content": "<system>"}, {"role": "user", "content": "<query1>"}, {"role": "assistant", "content": "answer1, "}]}
```
This is supported in swift3; refer to [examples/infer/demo_agent](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py).

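A record of this shape can be built programmatically; the trailing assistant message holds the partial answer to be continued (the placeholder strings are from the question above):

```python
import json

# A request whose last message is a partial assistant reply; the model
# then continues completing "answer1, " instead of starting a new turn.
record = {
    "messages": [
        {"role": "system", "content": "<system>"},
        {"role": "user", "content": "<query1>"},
        {"role": "assistant", "content": "answer1, "},
    ]
}
print(json.dumps(record, ensure_ascii=False))
```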
### Q31: How can I make swift infer write results to result_path in real time instead of writing everything at once at the end?
```shell
swift infer \
--ckpt_dir model_dir \
--streaming true \
--val_dataset dataset.jsonl \
--result_path result.jsonl
```
Use `--stream true`. This writes results one by one, but it is non-batch inference.

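If you post-process results yourself, the incremental pattern looks like this (plain Python, independent of swift; the path is created in a temp directory so the sketch is self-contained):

```python
import json
import os
import tempfile

def append_result(path, record):
    """Append one JSON record per line and flush immediately, so partial
    results survive an interruption."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        f.flush()

path = os.path.join(tempfile.mkdtemp(), "result.jsonl")
append_result(path, {"response": "hello"})
append_result(path, {"response": "world"})
```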
### Q32: When I trained and did inference in Swift it worked, but after merge_lora, when using Ollama's API, the effect disappeared.
Try loading with transformers; Swift's template is aligned with transformers.

### Q33: Which parameter should I set if I need to continue inference under a specific prefix during model inference?
The parameter `--response_prefix`.

### Q34: How do I fix this error that keeps appearing?
```text
File "/mnt/workspace/swift/swift/llm/dataset/preprocessor/core.py", line 69, in _check_messages
raise ValueError(f'assistant_message: {assistant_message}')
ValueError: assistant_message: {'role': 'assistant', 'content': ''}
```
```shell
CUDA_VISIBLE_DEVICES=0 NPROC_PER_NODE=1 MAX_PIXELS=1003520 swift sft --model Qwen/Qwen2.5-VL-7B-Instruct --train_type lora --dataset /mnt/workspace/data.json --deepspeed zero2 --max_length 16384
```
An assistant field in the dataset is empty. If this is for inference, delete the empty string, because it would cause NaN during training and is therefore checked.

## Deployment

### Q1: How to deploy a trained model?
Use `swift deploy --adapters xxx`. Refer to the documentation on [Inference and Deployment](https://swift.readthedocs.io/en/latest/Instruction/Inference-and-deployment.html).

### Q2: How to use vllm for multi-card deployment?
For details, see the [example](https://github.com/modelscope/ms-swift/tree/main/examples/deploy).

### Q3: How can clients pass images during vllm deployment?
See the [client examples](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client/mllm) for details.

### Q4: I have a question about deploying qwen2-7b and using it with a client. When calling the OpenAI API, should I use `client.completions.create` instead of `client.chat.completions.create`? But when using `qwen2-7b-instruct-q5_k_m.gguf`, I can use `client.chat.completions.create`. Why is that?
The base model can use `client.chat.completions.create`, but this is a compatibility behavior.

### Q5: After launching the server with swift deploy using two cards, when I exit with Ctrl+C, there is always a Python process that continues to occupy the memory of one card. Is this normal?
You may need to kill it. This is an issue with vllm.

### Q6: Where to check if a model supports lmdeploy or vllm acceleration?
vllm and lmdeploy each have their own range of supported models. Please check their respective official documentation to determine availability.

### Q7: Why does Tongyi Qianwen 2.5-Math-7B-Instruct sometimes return garbled characters when deployed with vllm in fp16?
Try using bf16.

### Q8: After starting the swift inference service, how can I set configurations like temperature interactively?
Inference only has preset configurations at startup, while deployment can set defaults at startup and then override them on the client side.

### Q9: When deploying the qwen2vl model locally, how can I input videos during inference? Can I use base64? How to call it with curl for video?
Use base64; see the [mllm client example](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client/mllm) for details.

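A hedged sketch of building the base64 payload in Python (the `video_url` content-part shape mirrors the linked client examples but may differ by server version; `demo.mp4` is written as a tiny stand-in file so the sketch runs end to end):

```python
import base64
import json

def to_data_url(path, mime="video/mp4"):
    """Read a local file and wrap it as a base64 data URL."""
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    return f"data:{mime};base64,{b64}"

# Stand-in bytes; replace with a real video file.
with open("demo.mp4", "wb") as f:
    f.write(b"\x00\x01")

messages = [{
    "role": "user",
    "content": [
        {"type": "video_url", "video_url": {"url": to_data_url("demo.mp4")}},
        {"type": "text", "text": "Describe the video."},
    ],
}]
print(json.dumps(messages)[:60])
```

The same data URL can be placed in a curl request body instead of a file path.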
### Q10: When deploying qwen2-vl, I encounter an error about the vllm version not being correct?
```text
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
```
Refer to the [issue](https://github.com/QwenLM/Qwen2-VL/issues/209).

### Q11: When using swift deploy for inference, I want to output token probabilities. I added logprobs True, but it outputs null. What's the reason?
```shell
RAY_memory_monitor_refresh_ms=0 CUDA_VISIBLE_DEVICES=1 nohup swift deploy --ckpt_dir /mnt/workspace/checkpoint_600 --infer_backend vllm --logprobs True --load_data_args false --host 0.0.0.0 --port 8000 &
```
Parameters need to be passed from the client side: `request_config = RequestConfig(..., logprobs=True, top_logprobs=2)`.

### Q12: Can we set a request timeout for Swift 3.0 deployment inference? What happens if the image URL is invalid?
You can set the `TIMEOUT` environment variable, which defaults to 300 seconds. Alternatively, you can pass parameters in `InferClient`.

### Q13: Why can't I get streaming generation with Swift-deployed models? I've set stream to True on both the server and client side, but it's still not streaming
It's controlled by the client side. Please check [examples/deploy/client](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client).

### Q14: After deploying a multimodal model with Swift, is there an example of passing PIL.Image from the client?
Check this [client example](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/client/mllm/openai_client.py).

### Q15: When deploying, which parameter should be set to output multiple results in a single response?
The parameter `n` in `RequestConfig`.

## Evaluation

### Q1: What evaluation datasets are supported by Swift?
Pure text evaluation:
```text
'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada',
'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze',
'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval',
'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench',
'ARC_e', 'COPA', 'ARC_c', 'DRCD'
```

Multimodal evaluation:
```text
'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN',
'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11',
'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2',
'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL',
'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar',
'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL',
'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI',
'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST',
'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500',
'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL',
'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME'
```

See the document [Evaluation](https://swift.readthedocs.io/en/latest/Instruction/Evaluation.html) for details.

### Q2: How to use a custom evaluation dataset?
Custom evaluation datasets, both plain text and multimodal, must match the data format (pattern) of an official dataset. See the document [Evaluation](https://swift.readthedocs.io/en/latest/Instruction/Evaluation.html) for details.

### Q3: Error with mmengine in a python3.11 environment during evaluation
Try using a Python 3.10 environment. Or first install all dependencies:
`pip3 install evalscope[all]`,
then apply the patch:
`pip3 install https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/package/evalscope-0.5.3.post1-py3-none-any.whl`.

### Q4: After manually downloading the official evaluation dataset, can Swift eval be configured for local path evaluation?
|
| 651 |
+
First, download the evaluation dataset [eval.zip](https://modelscope.cn/datasets/swift/evalscope_resource/files), extract it, and place its contents in the `~/.cache/modelscope/media_resources/evalscope/data` folder. Then execute the `swift eval` command to use the local data.
|
| 652 |
+
|
| 653 |
+
### Q5: Is there a bug with custom evaluation? I modified the standard example to English, but it doesn't work?
|
| 654 |
+
```shell
|
| 655 |
+
swift eval --model_type 'qwen2_5-1_5b-instruct' --eval_dataset no --custom_eval_config '/mnt/workspace/test_data/config_eval.json'
|
| 656 |
+
```
|
| 657 |
+
This relies on the nltk package, which needs to download a punkt_tab zip file. Some environments in China have unstable or failed downloads. The code has been modified to handle this issue; reference [issue](https://github.com/nltk/nltk/issues/3293).
|
| 658 |
+
|
| 659 |
+
### Q6: The model after eval fine-tuning keeps stopping at a fixed percentage, but the vllm service seems to be running normally. The larger the model, the sooner it disconnects.
|
| 660 |
+
Set the `TIMEOUT` environment variable to -1.
|
| 661 |
+
|
| 662 |
+
### Q7: Does evalscope support multi-model comparison?
|
| 663 |
+
See the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
|
| 664 |
+
|
| 665 |
+
### Q8: Is there custom evaluation support for multimodal datasets?

For custom evaluation of multimodal datasets, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html).

### Q9: Does ms-swift have methods to test QPS, latency, and tokens/s?

You can try evalscope's [Model Inference Stress Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).

### Q10: Can I control the number of dataset entries during evaluation? Evaluating MMLU takes over an hour, which is too slow.

Use the `--eval_limit` parameter. `--eval_limit` controls the number of entries in each subset. For example, MMLU has over 50 subsets; if each is limited to 10 entries, the total is over 500 entries.

### Q11: When evaluating, isn't it just having the model output an answer once and checking if it's correct? Is there a way to record or see the complete answer each time?

For multiple-choice evaluations such as ceval, the evaluation is done by computing the logits of each option, without generating the actual answer content. If you want to see the answer content, you can deploy the model as a service and specify the API URL for evaluation; this method evaluates by parsing the model's output. See the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-api-service-evaluation) for details. Both methods are available as options.

### Q12: I want to stress test my model with evalscope using a prompt.txt file. What should the format of this file look like?

Configure `line_by_line`; see the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/parameters.html#dataset-configuration) for details.

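As a minimal sketch (the file contents and the `evalscope perf` flags in the comment are illustrative, following the stress-test parameter docs): with `line_by_line`, each line of the file is treated as one independent request prompt.

```shell
# Each line of prompt.txt is sent as one standalone prompt in line_by_line mode.
cat > prompt.txt <<'EOF'
What is the capital of France?
Explain the Pythagorean theorem in one sentence.
EOF
wc -l < prompt.txt
# Then point the stress test at the file, e.g.:
#   evalscope perf --url http://127.0.0.1:8000/v1/chat/completions \
#     --parallel 4 --number 20 --model qwen2.5 \
#     --dataset line_by_line --dataset-path prompt.txt
```
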
### Q13: How should I use the `parallel` and `number` parameters when running model inference performance tests with evalscope perf?

`number` is the total number of requests, while `parallel` is the number of concurrent requests.

### Q14: In swift eval, the model stops generating after 1024 tokens. How can I change this? Setting --max_new_tokens 5000 doesn't seem to work.

This parameter has not been exposed in swift yet. You can run the evaluation with evalscope instead and configure max_tokens for the model according to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html#configure-model-evaluation-parameters).

### Q15: Does evalscope currently support benchmarks like AIME and MATH-500 for deepseek-r1?

Yes, it does. See the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html).

### Q16: I'm getting this error when using a local path for gpqa evaluation in evalscope: ValueError: BuildingConfig 'gpqa_extended' not found. Available: ['default']

Parameter configuration:
```shell
--datasets gpqa --dataset-args '{"gpqa": {"local_path": "/mnt/workspace/gpqa"} }'
```
If you want to use the dataset locally, it is recommended to clone the repository from ModelScope and then specify the path.

### Q17: When evaluating the arc dataset with evalscope using the local data path method, I get this error. What's the reason?

```text
KeyError: 'RequestId'
```
```shell
--datasets arc --dataset-args '{"arc": {"local_path": "/mnt/workspace/arc"}}'
```
According to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#using-local-datasets-and-models), the arc dataset needs to be downloaded using a Python script; directly cloning the repository won't work.

### Q18: How can I load locally downloaded datasets when using the opencompass backend for evaluation?

The opencompass backend doesn't support setting `data_args`.

### Q19: Does swift eval with --eval_backend OpenCompass not support custom datasets?

```text
ValueError: eval_dataset: /mnt/workspace/data.jsonl is not supported.
eval_backend: OpenCompass supported datasets: ['C3', 'summedits', 'WiC', 'csl', 'lambada', 'mbpp', 'hellaswag', 'ARC_e', 'math', 'nq', 'race', 'MultiRC', 'cmb', 'ceval', 'GaokaoBench', 'mmlu', 'winogrande', 'tnews', 'triviaqa', 'CB', 'cluewsc', 'humaneval', 'AX_g', 'DRCD', 'RTE', 'ocnli_fc', 'gsm8k', 'obqa', 'ReCoRD', 'Xsum', 'ocnli', 'WSC', 'siqa', 'agieval', 'piqa', 'cmnli', 'cmmlu', 'eprstmt', 'storycloze', 'AX_b', 'afqmc', 'strategyqa', 'bustm', 'BoolQ', 'COPA', 'ARC_c', 'PMMEval', 'chid', 'CMRC', 'lcsts']
```
OpenCompass doesn't support custom datasets; use native mode for custom datasets.

### Q20: When I run the RAGAS evaluation task from the evalscope official documentation (https://evalscope.readthedocs.io/zh-cn/latest/user_guides/backend/rageval_backend/ragas.html) locally on a single A100, it takes 10 minutes to run the two examples in the documentation. Is this normal? Are there ways to speed it up?

RAG evaluation itself is resource-intensive, and using a local critic LLM is slower still because it can't batch requests. It's recommended to launch the task with a serving framework such as vLLM.

ms-swift/docs/source_en/Instruction/GRPO.md

# GRPO

Paper links:

[DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/abs/2402.03300)
[DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://arxiv.org/abs/2501.12948)

Environment setup:

```bash
pip install math_verify # reward function
pip install -U trl
```

**Dev Log**

- **2025-05-11** — Implemented support for the **Generative Reward Model** and enabled customized reward model processing logic through the reward plugin. For more details, refer to the [Customized Reward Models](#customized-reward-models) section.
- **2025-04-30** — The startup command for the external vLLM server has been changed to `swift rollout`.

**FAQ**

1. It is normal for the loss to approach zero during training. Refer to this [issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) for more details.
2. How are the training steps calculated? Refer to this [issue](https://github.com/modelscope/ms-swift/issues/3912) for more details.
3. Why is the clip_ratio always 1? Refer to this [issue](https://github.com/huggingface/open-r1/issues/239#issuecomment-2646297851) for more details.

## Cluster Support

The GRPO training framework supports the integration of high-performance inference engines (such as vLLM) to accelerate the sampling process, offering the following two deployment modes:

### 1. Internal Integration Mode

- Launch the inference service directly within the Trainer.
- Provides two resource allocation strategies:
  - **Colocate Mode**: Training and inference share GPU resources.
  - **Async Mode**: Training and inference use separate GPU resources.

### GRPO Training Resource Allocation Scheme

| Configuration Scenario | NPROC_PER_NODE | num_infer_workers | Resource Allocation Description |
|-------------------------|----------------|-------------------|---------------------------------------|
| **Colocate** | = Total GPUs | = Total GPUs | Training and inference share all GPU resources. |
| **Async** | = Training GPUs | = Inference GPUs | Must satisfy: Training GPUs + Inference GPUs = Total GPUs. |

**Note:**
1. In Colocate mode, it is recommended to set `sleep_level=1` to release the GPU memory occupied by vLLM while the model is training.
2. "Total GPUs" refers to the total number of visible GPU devices.

### 2. External Service Mode

Connect to an external vLLM inference server.
When using this mode, configure the following parameters to point at the external vLLM server:

```bash
--vllm_server_host <Server IP> \
--vllm_server_port <Server Port> \
--vllm_server_timeout <Timeout>
```

Deploy the vLLM server using the `swift rollout` command. Currently, only the vLLM backend is supported. Note that `tensor_parallel_size 2` requires two visible GPUs:

```bash
CUDA_VISIBLE_DEVICES=2,3 \
swift rollout \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--tensor_parallel_size 2
```

The complete script can be found [here](../../../examples/train/grpo/multi_node/Qwen2_5_32B_full.sh).

## Reward Functions
### Custom Reward Functions

A reward function takes the text `completions` generated by the model and other columns from the dataset as keyword arguments (`kwargs`), and scores the generated text. Below is an example of a simple length-based reward function: it gives a reward of 1.0 if the generated text is longer than 1024 characters, and 0.0 otherwise.

```python
from swift.plugin import ORM, orms

class DummyLengthRewardFunction(ORM):
    def __call__(self, completions, **kwargs):
        return [1.0 if len(completion) > 1024 else 0.0 for completion in completions]

orms['dummy'] = DummyLengthRewardFunction
```

You can add this reward function in `swift/examples/train/grpo/plugin/plugin.py`, register it with the parameter `--external_plugins examples/train/grpo/plugin/plugin.py`, and then select it with the `reward_funcs` parameter.

For an example of how to execute the script, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/grpo/plugin/run_external_rm.sh).

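To illustrate the calling convention outside of swift (a standalone sketch; the real function subclasses `ORM` as above), the trainer passes the batch of completions positionally and the remaining dataset columns, such as `solution`, as keyword arguments:

```python
# Standalone sketch of the reward-function contract: one float per completion.
class DummyLengthRewardFunction:
    def __call__(self, completions, **kwargs):
        # kwargs carries the extra dataset columns, e.g. solution=[...]
        return [1.0 if len(completion) > 1024 else 0.0 for completion in completions]

reward_fn = DummyLengthRewardFunction()
rewards = reward_fn(["short answer", "x" * 2000], solution=["42", "42"])
print(rewards)  # [0.0, 1.0]
```
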
### Built-in Reward Functions
Swift provides five rule-based reward functions built into the system (the code can be found in `swift/plugin/orm.py`).

| Reward Function | Paper |
|----------------|----------------------------------------------------------------------------|
| accuracy | [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via RL](https://arxiv.org/abs/2501.12948) |
| format | Same as above |
| cosine | [Demystifying Long Chain-of-Thought Reasoning in LLMs](https://arxiv.org/abs/2502.03373) |
| repetition | Same as above |
| soft_overlong | [Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO)](https://arxiv.org/abs/2503.14476) |

#### 1. **accuracy**

This function compares the model's generated result with the `solution` column in the dataset to calculate an accuracy score. If the generated result matches the standard answer, the score is 1.0; otherwise, it is 0.0.

Note: This reward function uses the `math_verify` library to parse both the generated results and the answers in `solution`, so it may only be applicable to specific mathematical datasets.

#### 2. **format**

The paper uses the following system prompt to enforce a fixed format for model responses:
```
A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
```
This function checks whether the model generates text in the format `<think>think content</think><answer>answer content</answer>`. If the generated text adheres to the format requirements, the score is 1.0; otherwise, it is 0.0.

#### 3. **cosine**

The paper found that training with only the accuracy reward could lead to overly long generated sequences, hurting training performance. The cosine reward optimizes the training process by controlling the length of the generated sequences:

- For text that produces the correct answer, the reward decreases as length increases, encouraging concise responses.
- For text that produces an incorrect answer, the reward increases as length increases, encouraging deeper reasoning.

A cosine function smoothly adjusts the reward value so that changes stay within a reasonable range. Its parameters are the length of the generated text, the maximum length limit, and the minimum and maximum reward values.

Parameters:
- cosine_min_len_value_wrong (default: -0.5): Reward value at the minimum length when the answer is incorrect.
- cosine_max_len_value_wrong (default: 0.0): Reward value at the maximum length when the answer is incorrect.
- cosine_min_len_value_correct (default: 1.0): Reward value at the minimum length when the answer is correct.
- cosine_max_len_value_correct (default: 0.5): Reward value at the maximum length when the answer is correct.
- cosine_max_len (default: the model's maximum generation length): Maximum length limit for generated text.

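The shape of the schedule can be sketched as follows (an illustrative formula, not the exact `orm.py` implementation): the reward interpolates between the min-length and max-length values along a cosine curve over the generation length.

```python
import math

def cosine_schedule(length, max_len, min_len_value, max_len_value):
    # progress goes 0 -> 1 as length grows; cos(pi * progress) goes 1 -> -1,
    # so the reward moves smoothly from min_len_value to max_len_value.
    progress = min(length / max_len, 1.0)
    cos = math.cos(progress * math.pi)
    return max_len_value + 0.5 * (min_len_value - max_len_value) * (1.0 + cos)

# Correct answer: reward shrinks from 1.0 toward 0.5 as the response gets longer.
print(cosine_schedule(0, 1000, 1.0, 0.5))      # 1.0
print(cosine_schedule(1000, 1000, 1.0, 0.5))   # 0.5
# Wrong answer: reward grows from -0.5 toward 0.0 with length.
print(cosine_schedule(1000, 1000, -0.5, 0.0))  # 0.0
```
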
#### 4. **repetition**

This function penalizes repetition in generated text by detecting repeated n-gram patterns and assigning penalties based on the level of repetition.

It splits the generated text into words and extracts n-grams of a specified size (default: 3-grams), then computes a repetition ratio from the proportion of unique n-grams among all n-grams. If the proportion of repeated n-grams is high, a large negative reward (penalty) is applied. The penalty is computed from the repetition ratio and a maximum penalty value (default: -1.0).

Parameters:
- repetition_n_grams (default: 3): Size of the n-gram used to detect repetition.
- repetition_max_penalty (default: -1.0): Maximum penalty value, which controls the intensity of the penalty.

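The core computation can be sketched like this (an illustrative reimplementation of the idea, not the exact `orm.py` code):

```python
def repetition_penalty(text, n_grams=3, max_penalty=-1.0):
    words = text.split()
    if len(words) < n_grams:
        return 0.0  # too short to form even one n-gram
    ngrams = [tuple(words[i:i + n_grams]) for i in range(len(words) - n_grams + 1)]
    repeated_ratio = 1.0 - len(set(ngrams)) / len(ngrams)
    # 0.0 for fully unique text, approaching max_penalty as repetition grows
    return 0.0 if repeated_ratio == 0 else max_penalty * repeated_ratio

print(repetition_penalty("the cat sat on the mat"))  # 0.0 (all 3-grams unique)
print(repetition_penalty("go go go go go go"))       # -0.75 (1 unique of 4 3-grams)
```
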
#### 5. **soft overlong punishment**
Defines a length penalty interval within which a linear penalty in [-1, 0] is applied.

Parameters:
- soft_max_length: L_max in the paper, the maximum generation length of the model; defaults to max_completion_length.
- soft_cache_length: L_cache in the paper, controls the length penalty interval, defined as [soft_max_length - soft_cache_length, soft_max_length].

Original text from the paper:
> a length-aware penalty mechanism designed to shape the reward for truncated samples. Specifically, when the response length exceeds the predefined maximum value, we define a punishment interval. Within this interval, the longer the response, the greater the punishment it receives. This penalty is added to the original rule-based correctness reward, thereby signaling to the model to avoid excessively long responses.

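The interval logic can be sketched as follows (an illustrative reimplementation of the description above; the lengths used are example values):

```python
def soft_overlong_punishment(length, soft_max_length, soft_cache_length):
    threshold = soft_max_length - soft_cache_length
    if length <= threshold:
        return 0.0   # below the penalty interval: no penalty
    if length >= soft_max_length:
        return -1.0  # at or beyond the maximum length: full penalty
    # inside [threshold, soft_max_length]: linear ramp from 0 down to -1
    return (threshold - length) / soft_cache_length

print(soft_overlong_punishment(3000, soft_max_length=4096, soft_cache_length=512))  # 0.0
print(soft_overlong_punishment(3840, soft_max_length=4096, soft_cache_length=512))  # -0.5
print(soft_overlong_punishment(5000, soft_max_length=4096, soft_cache_length=512))  # -1.0
```
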
#### 6. **Reward Models**

In addition to rule-based reward functions, this framework also supports using reward models as reward functions. When using a reward model, you need to specify the `reward_model` parameter, which, like the `model` parameter, specifies the path or name of the reward model. Note that at least one of `reward_model` and `reward_funcs` must be specified.

## Arguments and Execution Script
Arguments

- per_device_train_batch_size: The training batch size per device. In GRPO, this refers to the batch size of completions during training.
- per_device_eval_batch_size: The evaluation batch size per device. In GRPO, this refers to the batch size of completions during evaluation.
- num_generations: The number of samples generated for each prompt, the G value in the paper. per_device_train_batch_size * gradient_accumulation_steps * nproc_per_node must be divisible by num_generations. Default is 8.
- max_completion_length: The maximum length for sampling generation. Default is 512.
- ds3_gather_for_generation: This parameter applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. Disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling it is not compatible with vLLM generation. Default is True.
- reward_funcs: Reward functions used to score the results generated by the model. Includes the built-in accuracy, format, cosine, and repetition rule-based functions, detailed in the swift/plugin/orm.py file.
- reward_weights: Weights for each reward function. The number must equal the sum of the number of reward functions and reward models. If `None`, all rewards are weighted equally with weight `1.0`.
  - Note: If `--reward_model` is included in GRPO training, it is added to the end of the reward functions.
- reward_model: Specified like model; uses a reward model as a reward function. At least one of reward_funcs and reward_model must be specified.
- reward_model_plugin: The processing logic for the reward model, which defaults to ORM logic. For more information, refer to [Customized Reward Models](#customized-reward-models).
- dataset_shuffle: Whether to shuffle the dataset randomly. Default is True.
- loss_type: The type of loss normalization. Options are ['grpo', 'bnpo', 'dr_grpo']; default is 'grpo'. For details, see this [pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348).
- log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb`. Default is False.
  - Note: If `--report_to wandb` is not set, a `completions.jsonl` will be created in the checkpoint directory to store the generated content.
- use_vllm: Whether to use vLLM as the backend for sampling generation. Default is False; enabling it is recommended to speed up training.
- vllm_device: Device for deploying vLLM. Default is auto, meaning the first unused GPU. Use cuda:x to specify a particular card.
- vllm_gpu_memory_utilization: vLLM passthrough parameter. Default is 0.9.
- vllm_max_model_len: vLLM passthrough parameter. Default is None.
- vllm_max_num_seqs: vLLM passthrough parameter. Default is 256.
- vllm_enforce_eager: vLLM passthrough parameter. Default is False.
- vllm_limit_mm_per_prompt: vLLM passthrough parameter. Default is None.
- vllm_enable_prefix_caching: vLLM passthrough parameter. Default is True.
- vllm_server_host: The host address of the vLLM server. Default is None. Used when connecting to an external vLLM server.
- vllm_server_port: The service port of the vLLM server. Default is 8000.
- vllm_server_timeout: The connection timeout for the vLLM server. Default is 120 seconds.
- num_iterations: Number of iterations per batch. Default is 1.
- epsilon: Epsilon value for clipping. Default is 0.2.
- epsilon_high: Upper clip coefficient. Default is None. When set, it forms a clipping range of [epsilon, epsilon_high] together with epsilon.
- async_generate: Use async rollout to improve training speed. Default is false.
- sleep_level: vLLM-specific. When the actor and rollout share the same GPU, vLLM can be put to sleep while the model is training.
- move_model_batches: When moving model parameters to fast inference frameworks such as vLLM/LMDeploy, determines how many batches to divide the layers into. The default is `None`, meaning the entire model is not split. Otherwise, the model is split into `move_model_batches` + `1` (non-layer parameters) + `1` (multimodal component parameters) batches.
- offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. Default is `False`.
- offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. Default is `False`.
  - Note: If this parameter is set to True and the grad_norm remains zero during training, please install vllm==0.7.3.
- gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. Default is `False`.
- multi_turn_func: The multi-turn GRPO plugin name. Add your multi-turn implementation in plugin/multi_turn.py.
- dynamic_sample: Exclude data within a group where the reward standard deviation is 0, and sample additional new data to replace it. Default is False.
- max_resample_times: Under the dynamic_sample setting, limits the number of resampling attempts. Default is 3.
- overlong_filter: Skip overlong truncated samples, which are not included in the loss calculation. Default is False.

The hyperparameters for the reward functions can be found in the [Built-in Reward Functions](#built-in-reward-functions) section.

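The divisibility constraint on `num_generations` can be checked with a quick calculation (the values below are illustrative and match the async example script that follows):

```python
# num_generations (G) must evenly divide the global completion batch:
# per_device_train_batch_size * gradient_accumulation_steps * nproc_per_node
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
nproc_per_node = 7
num_generations = 7

global_batch = per_device_train_batch_size * gradient_accumulation_steps * nproc_per_node
print(global_batch)                         # 14
print(global_batch % num_generations == 0)  # True -> valid configuration
```
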
You can use vLLM and LMDeploy as sampling backends to accelerate training.

Multi-GPU vLLM
```bash
# async mode
# The requirement is that num_infer_workers (deployment) + NPROC_PER_NODE (training) = device_count.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=7 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B \
--reward_funcs accuracy format cosine repetition \
--use_vllm true \
--vllm_device auto \
--vllm_gpu_memory_utilization 0.7 \
--vllm_max_model_len 8192 \
--num_infer_workers 1 \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-MO/NuminaMath-TIR#5000' \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 7 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--log_completions true

# colocate mode
# The requirement is that num_infer_workers (deployment) = NPROC_PER_NODE (training) = device_count.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-1.5B \
--reward_funcs accuracy format \
--use_vllm true \
--vllm_device auto \
--vllm_gpu_memory_utilization 0.7 \
--vllm_max_model_len 8192 \
--num_infer_workers 8 \
--train_type full \
--torch_dtype bfloat16 \
--dataset 'AI-MO/NuminaMath-TIR#5000' \
--max_completion_length 2048 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--eval_steps 200 \
--save_steps 200 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 4096 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 8 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--deepspeed zero2 \
--sleep_level 1 \
--offload_model true \
--offload_optimizer true \
--gc_collect_after_offload true \
--log_completions true
```

Single-GPU
```bash
# PT backend
CUDA_VISIBLE_DEVICES=0 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B \
--reward_funcs accuracy format cosine repetition \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--torch_dtype bfloat16 \
--dataset 'AI-MO/NuminaMath-TIR#1000' \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--learning_rate 1e-5 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 4 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--log_completions true

# vLLM backend
CUDA_VISIBLE_DEVICES=0 \
swift rlhf \
--rlhf_type grpo \
--model Qwen/Qwen2.5-7B \
--vllm_gpu_memory_utilization 0.5 \
--use_vllm true \
--sleep_level 1 \
--offload_model true \
--offload_optimizer true \
--gc_collect_after_offload true \
--reward_funcs accuracy format \
--train_type lora \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--torch_dtype bfloat16 \
--dataset 'AI-MO/NuminaMath-TIR#1000' \
--max_completion_length 1024 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--learning_rate 1e-5 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 4 \
--temperature 0.9 \
--system 'examples/train/grpo/prompt.txt' \
--log_completions true
```

For multi-node training, refer to [here](../../../examples/train/grpo/multi_node/).

Note: In the internal integration mode, the GPU configurations and training parameters must be identical across different nodes.

## Customized Reward Models
|
| 361 |
+
By default, a reward model refers to classification models that include a value head (commonly known as Output Reward Model (ORM)). These models score the outputs of other models, producing a scalar value that represents the quality of the response.
|
| 362 |
+
|
| 363 |
+
Currently, we can leverage the **reward_model_plugin** to flexibly customize the processing logic of these reward models. This enables the implementation of advanced techniques such as Generative Reward Models, which include:
|
| 364 |
+
|
| 365 |
+
- Customizing the Model's System Prompt: Defining specific instructions and context to guide the evaluation process.
|
| 366 |
+
- Handling Model Interaction History: Managing the conversational context to provide meaningful and contextually aware evaluations.
|
| 367 |
+
- Defining Custom Evaluation Criteria: Setting unique standards and metrics for assessing the model's responses beyond default accuracy and relevance measures.
|
| 368 |
+
|
| 369 |
+
Through the **reward_model_plugin**, developers can tailor the reward evaluation process to meet the specific requirements of their applications. This flexibility allows for more nuanced and effective reward-based training strategies.
|
| 370 |
+
|
| 371 |
+
We provide a simple generative reward model example (GenRMPlugin) in [rm_plugin.py](../../../swift/plugin/rm_plugin.py)
|
| 372 |
+
|
| 373 |
+
You can also customized your reward model plugin in [plugin.py](../../../examples/train/grpo/plugin/plugin.py), and register with `external_plugins` argument
|
| 374 |
+
|
| 375 |
+
Here is an example training script to train GRPO with two reward models: one ORM and one Gen-RM (using qwen2.5-3B-Instruct in this case):

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-7B \
    --dataset AI-MO/NuminaMath-TIR#5000 \
    --external_plugins examples/train/grpo/plugin/plugin.py \
    --reward_funcs format \
    --reward_model Qwen/Qwen2.5-3B-Instruct Shanghai_AI_Laboratory/internlm2-7b-reward \
    --reward_model_plugin genrm my_rmplugin \
    --reward_weights 0.1 1 1 \
    --num_infer_workers 8 \
    --vllm_gpu_memory_utilization 0.5 \
    --sleep_level 1 \
    --offload_model true \
    --offload_optimizer true \
    --gc_collect_after_offload true \
    --log_completions true \
    --deepspeed zero3
```

Notes:

1. In GRPOTrainer, `reward_model` instances are appended sequentially to `reward_funcs`. Therefore, the order of `reward_weights` corresponds to [reward_funcs, reward_model].
2. The default value of `reward_model_plugin` is `default`, which uses the ORM processing logic.

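The weighting order described in note 1 can be made concrete with a small stand-alone sketch. The reward values below are hypothetical, and this illustrates only the weighting, not GRPOTrainer internals:

```python
# Rewards per completion, in registration order for the script above:
# reward_funcs: 'format'; reward_model: genrm plugin, then my_rmplugin.
# All reward values here are hypothetical.
rewards = {
    'format': 1.0,        # rule-based format reward
    'genrm': 0.8,         # generative reward model score
    'my_rmplugin': 0.6,   # ORM score
}
# --reward_weights 0.1 1 1 maps onto the same [reward_funcs, reward_model] order.
weights = [0.1, 1.0, 1.0]

# Final reward is the weighted sum over all reward sources.
total = sum(w * r for w, r in zip(weights, rewards.values()))
print(round(total, 6))  # 1.5
```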
## DAPO

Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) introduces several tricks on top of GRPO, namely:

- Clip Higher
- Dynamic Sampling
- Overlong Filtering
- Token-level Loss
- Soft Overlong Punishment

Among these, token-level loss is implemented by default and requires no additional settings. The other tricks can be enabled on top of GRPOTrainer by configuring the following parameters:

| Parameter              | Type    | Value           |
|------------------------|---------|-----------------|
| `--epsilon_high`       | `float` | `0.28`          |
| `--dynamic_sample`     | `bool`  | `true`          |
| `--overlong_filter`    | `bool`  | `true`          |
| `--reward_funcs`       | `str`   | `soft_overlong` |
| `--max_resample_times` | `int`   | `3`             |

Reference training script (for 8-GPU colocate mode):

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
WANDB_API_KEY=xxx \
swift rlhf \
    --rlhf_type grpo \
    --model Qwen/Qwen2.5-1.5B \
    --reward_funcs accuracy soft_overlong \
    --max_completion_length 4096 \
    --soft_cache_length 819 \
    --epsilon 0.2 \
    --epsilon_high 0.28 \
    --dynamic_sample true \
    --overlong_filter true \
    --max_resample_times 3 \
    --use_vllm true \
    --vllm_gpu_memory_utilization 0.6 \
    --num_infer_workers 8 \
    --train_type full \
    --torch_dtype bfloat16 \
    --dataset AI-MO/NuminaMath-TIR#5000 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --learning_rate 1e-6 \
    --eval_steps 1000 \
    --save_steps 1000 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --dataset_num_proc 4 \
    --num_generations 8 \
    --temperature 1.0 \
    --top_p 1.0 \
    --deepspeed zero2 \
    --log_completions true \
    --num_iterations 1 \
    --report_to tensorboard wandb \
    --beta 0.0 \
    --sleep_level 1 \
    --offload_model true \
    --offload_optimizer true \
    --gc_collect_after_offload true
```
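For intuition, the soft overlong punishment used above can be sketched as a pure function. The piecewise shape follows the DAPO paper (zero penalty up to `max_completion_length - soft_cache_length`, a linear ramp inside the cache interval, and -1 beyond); this is an illustration, not SWIFT's exact implementation:

```python
def soft_overlong_penalty(completion_length: int,
                          max_completion_length: int = 4096,
                          soft_cache_length: int = 819) -> float:
    """Illustrative soft overlong punishment (DAPO-style), not SWIFT's exact code.

    Returns 0 for short completions, ramps linearly down to -1 inside the
    soft cache interval, and stays at -1 once max_completion_length is exceeded.
    """
    threshold = max_completion_length - soft_cache_length  # 3277 with the defaults above
    if completion_length <= threshold:
        return 0.0
    if completion_length <= max_completion_length:
        return (threshold - completion_length) / soft_cache_length
    return -1.0

print(soft_overlong_penalty(1000))   # 0.0
print(soft_overlong_penalty(4096))   # -1.0
print(soft_overlong_penalty(5000))   # -1.0
```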
|
ms-swift/docs/source_en/Instruction/Inference-and-deployment.md
ADDED
|
@@ -0,0 +1,354 @@
# Inference and Deployment

Below are the inference engines supported by SWIFT along with their corresponding capabilities. The three inference acceleration engines provide acceleration for SWIFT's inference, deployment, and evaluation modules:

| Inference Acceleration Engine | OpenAI API | Multimodal | Quantized Model | Multiple LoRAs | QLoRA | Batch Inference | Parallel Techniques |
| --- | --- | --- | --- | --- | --- | --- | --- |
| pytorch | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/client/llm/chat/openai_client.py) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/app/mllm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_lora.py) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/batch_ddp.sh) | DDP/device_map |
| [vllm](https://github.com/vllm-project/vllm) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_tp.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/deploy/lora/server.sh) | ❌ | ✅ | TP/PP/DP |
| [lmdeploy](https://github.com/InternLM/lmdeploy) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/infer/lmdeploy/mllm_tp.sh) | ✅ | ❌ | ❌ | ✅ | TP/DP |

## Inference

ms-swift uses a layered design philosophy, allowing users to perform inference through the command-line interface, the web UI, or directly from Python.

For inference with a model fine-tuned using LoRA, please refer to the [Pre-training and Fine-tuning documentation](./Pre-training-and-Fine-tuning.md#inference-fine-tuned-model).

### Using CLI

**Full-Parameter Model:**

```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --stream true \
    --infer_backend pt \
    --max_new_tokens 2048
```

**LoRA Model:**

```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --adapters swift/test_lora \
    --stream true \
    --infer_backend pt \
    --temperature 0 \
    --max_new_tokens 2048
```

**Command-Line Inference Instructions**

The commands above start an interactive command-line inference session: after running the script, simply enter your query in the terminal. The following special commands are also available:

- `multi-line`: switch to multi-line mode, allowing line breaks in the input, ended with `#`.
- `single-line`: switch to single-line mode, where a line break ends the input.
- `reset-system`: reset the system prompt and clear the history.
- `clear`: clear the history.
- `quit` or `exit`: exit the conversation.
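As an illustration of the multi-line mode described above (not SWIFT's actual REPL code), input lines are simply accumulated until one ends with the terminating `#`:

```python
def read_multiline(lines):
    """Collect input lines until one ends with '#', mimicking the CLI's
    multi-line mode; illustrative only."""
    buffer = []
    for line in lines:
        if line.rstrip().endswith('#'):
            # Strip the terminator and stop reading.
            buffer.append(line.rstrip().rstrip('#'))
            break
        buffer.append(line)
    return '\n'.join(buffer)

query = read_multiline(['Solve:', 'x + 1 = 2#'])
print(query)  # Solve:\nx + 1 = 2
```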

**Multimodal Model**

```shell
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift infer \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --stream true \
    --infer_backend pt \
    --max_new_tokens 2048
```

To perform inference with a multimodal model, you can add `<image>`, `<video>`, or `<audio>` tags in your query (marking where the corresponding multimodal representations are placed in `inputs_embeds`). For example, you can input `<image><image>What is the difference between these two images?` or `<video>Describe this video.`, and then follow the prompts to input the corresponding images, videos, or audio files.

Here is an example inference session:

```
<<< <image><image>What is the difference between these two images?
Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
Input an image path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png
The first image depicts a cute, cartoon-style kitten with large, expressive eyes and a fluffy white and gray coat. The background is simple, featuring a gradient of colors that highlight the kitten's face.

The second image shows a group of four cartoon-style sheep standing on a grassy field with mountains in the background. The sheep have fluffy white wool, black legs, and black faces with white markings around their eyes and noses. The background includes green hills and a blue sky with clouds, giving it a pastoral and serene atmosphere.
--------------------------------------------------
<<< clear
<<< <video>Describe this video.
Input a video path or URL <<< https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4
A baby wearing glasses is sitting on a bed and reading a book. The baby is holding the book with both hands and is looking down at it. The baby is wearing a light blue shirt and pink pants. The baby is sitting on a white pillow. The baby is looking at the book with interest. The baby is not moving much, just turning the pages of the book.
```

**Dataset Inference:**

```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --stream true \
    --infer_backend pt \
    --val_dataset AI-ModelScope/alpaca-gpt4-data-zh \
    --max_new_tokens 2048
```

The examples above cover streaming inference for both full-parameter and LoRA models; SWIFT also provides the following inference techniques:

- Interface inference: change `swift infer` to `swift app`.
- Batch inference: for large language and multimodal models, specify `--max_batch_size` to enable batch inference with `infer_backend=pt`; see [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/batch_ddp.sh). Note that `--stream true` cannot be set when performing batch inference.
- DDP/device_map inference: `infer_backend=pt` supports parallel inference via DDP/device_map; see [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/mllm_device_map.sh).
- Inference acceleration: SWIFT supports vllm/lmdeploy acceleration across the inference, deployment, and evaluation modules by simply adding `--infer_backend vllm/lmdeploy`; see [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/ddp.sh).
- Multimodal models: we provide shell scripts for multi-GPU inference of multimodal models using [pt](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/mllm_device_map.sh), [vllm](https://github.com/modelscope/ms-swift/blob/main/examples/infer/vllm/mllm_tp.sh), and [lmdeploy](https://github.com/modelscope/ms-swift/blob/main/examples/infer/lmdeploy/mllm_tp.sh).
- Quantized models: you can directly select models quantized with GPTQ, AWQ, or BNB, for example: `--model Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4`.
- More model types: we also provide inference scripts for [bert](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/bert.sh), [reward_model](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/reward_model.sh), and [prm](https://github.com/modelscope/ms-swift/blob/main/examples/infer/pt/prm.sh).
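The DDP inference mentioned above amounts to sharding the evaluation set across ranks, each rank running the model independently on its slice. A minimal stdlib sketch of the round-robin split (the actual scripts obtain `rank`/`world_size` from torchrun environment variables such as `RANK` and `WORLD_SIZE`):

```python
def shard_for_rank(samples, rank: int, world_size: int):
    """Round-robin shard of a dataset for one DDP rank; illustrative only."""
    return samples[rank::world_size]

samples = list(range(10))
# Each rank sees a disjoint slice; together they cover the whole dataset.
assert shard_for_rank(samples, 0, 4) == [0, 4, 8]
assert shard_for_rank(samples, 3, 4) == [3, 7]
```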

**Tips:**

- SWIFT saves inference results; you can specify the save path with `--result_path`.
- To output log probabilities, specify `--logprobs true` during inference and SWIFT will save them. Note that results are not saved when `--stream true` is set.
- `infer_backend=pt` supports inference for all models supported by SWIFT, while `infer_backend=vllm/lmdeploy` supports only a subset. Please refer to the documentation for [vllm](https://docs.vllm.ai/en/latest/models/supported_models.html) and [lmdeploy](https://lmdeploy.readthedocs.io/en/latest/supported_models/supported_models.html).
- If you encounter OOM when using `--infer_backend vllm`, lower `--max_model_len` or `--max_num_seqs`, choose an appropriate `--gpu_memory_utilization`, or set `--enforce_eager true`. Alternatively, use tensor parallelism via `--tensor_parallel_size`.
- When inferring multimodal models with `--infer_backend vllm` and inputting multiple images, set `--limit_mm_per_prompt`, for example: `--limit_mm_per_prompt '{"image": 10, "video": 5}'`.
- If you encounter OOM while inferring qwen2-vl/qwen2.5-vl, set `MAX_PIXELS`, `VIDEO_MAX_PIXELS`, and `FPS_MAX_FRAMES`; refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/app/mllm.sh).
- SWIFT's built-in dialogue templates are aligned with the templates run via transformers; see [here](https://github.com/modelscope/ms-swift/blob/main/tests/test_align/test_template/test_vision.py) for the tests. If you find any misalignment, feel free to submit an issue or PR.
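Since results are written to `--result_path`, they can be post-processed with the standard library. A sketch assuming the file is JSON Lines with a `response` field per record (the exact schema may differ by version; the records here are synthetic):

```python
import json

# Synthetic stand-in for a saved inference result file; the real schema may differ.
records = [
    {'response': 'Hangzhou is the capital of Zhejiang.', 'logprobs': None},
    {'response': 'Sure, here is a poem.', 'logprobs': None},
]
with open('infer_result.jsonl', 'w', encoding='utf-8') as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

# Read the file back, one JSON object per line.
with open('infer_result.jsonl', encoding='utf-8') as f:
    loaded = [json.loads(line) for line in f]
print(len(loaded))  # 2
```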

### Using Web-UI

If you want to perform inference through a graphical interface, refer to the [Web-UI documentation](../GetStarted/Web-UI.md).

### Using Python

**Text Model:**

```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import PtEngine, RequestConfig, InferRequest
model = 'Qwen/Qwen2.5-0.5B-Instruct'

# Load the inference engine
engine = PtEngine(model, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)

# Using 2 infer_requests to demonstrate batch inference
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': 'Who are you?'}]),
    InferRequest(messages=[{'role': 'user', 'content': 'Where is the capital of Zhejiang?'},
                           {'role': 'assistant', 'content': 'The capital of Zhejiang Province, China, is Hangzhou.'},
                           {'role': 'user', 'content': 'What are some fun places here?'}]),
]
resp_list = engine.infer(infer_requests, request_config)
query0 = infer_requests[0].messages[0]['content']
print(f'query0: {query0}')
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
```

**Multimodal Model:**

```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['MAX_PIXELS'] = '1003520'
os.environ['VIDEO_MAX_PIXELS'] = '50176'
os.environ['FPS_MAX_FRAMES'] = '12'

from swift.llm import PtEngine, RequestConfig, InferRequest
model = 'Qwen/Qwen2.5-VL-3B-Instruct'

# Load the inference engine
engine = PtEngine(model, max_batch_size=2)
request_config = RequestConfig(max_tokens=512, temperature=0)

# Using 3 infer_requests to demonstrate batch inference
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': 'Who are you?'}]),
    InferRequest(messages=[{'role': 'user', 'content': '<image><image> What is the difference between these two images?'}],
                 images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
                         'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']),
    InferRequest(messages=[{'role': 'user', 'content': '<video> Describe the video'}],
                 videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']),
]
resp_list = engine.infer(infer_requests, request_config)
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
print(f'response2: {resp_list[2].choices[0].message.content}')
```

We also provide more demos for Python-based inference:

- Inference acceleration: for streaming inference with `VllmEngine` and `LmdeployEngine`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py).
- Multimodal inference: in addition to the input format above, SWIFT is compatible with OpenAI's multimodal input format; refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py).
- Grounding tasks: for performing grounding tasks with multimodal models, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_grounding.py).
- Multiple-LoRA inference: refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_lora.py).
- Agent inference: refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_agent.py).
- Asynchronous interface: for Python-based inference using `engine.infer_async`, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo.py).
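The asynchronous interface follows the usual asyncio pattern of issuing several requests concurrently and gathering the results. A stdlib-only sketch, with a stubbed coroutine standing in for `engine.infer_async`:

```python
import asyncio

async def fake_infer_async(request: str) -> str:
    # Stand-in for engine.infer_async; just echoes after a simulated delay.
    await asyncio.sleep(0.01)
    return f'response to: {request}'

async def main(requests):
    # Launch all requests concurrently and collect results in input order.
    tasks = [fake_infer_async(r) for r in requests]
    return await asyncio.gather(*tasks)

responses = asyncio.run(main(['Who are you?', 'Describe the video']))
print(responses)
```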

## Deployment

For deploying a model fine-tuned with LoRA, refer to the [Pre-training and Fine-tuning documentation](./Pre-training-and-Fine-tuning.md#deployment-fine-tuned-model).

This section focuses on the deployment and invocation of multimodal models. For text-based large models, here is a simple deployment and invocation example:

**Server Deployment:**

```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend vllm \
    --max_new_tokens 2048 \
    --served_model_name Qwen2.5-7B-Instruct
```

**Client Invocation Test:**

```shell
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen2.5-7B-Instruct",
    "messages": [{"role": "user", "content": "What should I do if I can’t sleep at night?"}],
    "max_tokens": 256,
    "temperature": 0
  }'
```

### Server Side

```shell
# Test environment: pip install "transformers==4.49.*" vllm==0.7.3
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
swift deploy \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --infer_backend vllm \
    --gpu_memory_utilization 0.9 \
    --max_model_len 8192 \
    --max_new_tokens 2048 \
    --limit_mm_per_prompt '{"image": 5, "video": 2}' \
    --served_model_name Qwen2.5-VL-3B-Instruct
```

### Client Side

We introduce three ways to invoke the server: curl, the OpenAI library, and the SWIFT client.

**Method 1: curl**

```shell
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen2.5-VL-3B-Instruct",
    "messages": [{"role": "user", "content": [
      {"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png"},
      {"type": "image", "image": "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png"},
      {"type": "text", "text": "What is the difference between these two images?"}
    ]}],
    "max_tokens": 256,
    "temperature": 0
  }'
```

**Method 2: OpenAI Library**

```python
from openai import OpenAI

client = OpenAI(
    api_key='EMPTY',
    base_url='http://127.0.0.1:8000/v1',
)
model = client.models.list().data[0].id
print(f'model: {model}')

messages = [{'role': 'user', 'content': [
    {'type': 'video', 'video': 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'},
    {'type': 'text', 'text': 'describe the video'}
]}]

resp = client.chat.completions.create(model=model, messages=messages, max_tokens=512, temperature=0)
query = messages[0]['content']
response = resp.choices[0].message.content
print(f'query: {query}')
print(f'response: {response}')

# Streaming, using base64-encoded video data
import base64
import requests

resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
base64_encoded = base64.b64encode(resp.content).decode('utf-8')
messages = [{'role': 'user', 'content': [
    {'type': 'video', 'video': f'data:video/mp4;base64,{base64_encoded}'},
    {'type': 'text', 'text': 'describe the video'}
]}]

gen = client.chat.completions.create(model=model, messages=messages, stream=True, temperature=0)
print(f'query: {query}\nresponse: ', end='')
for chunk in gen:
    if chunk is None:
        continue
    print(chunk.choices[0].delta.content, end='', flush=True)
print()
```
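The base64 branch above relies on the standard data-URL convention. For a local file, the same string can be built without a download; a stdlib sketch using a few placeholder bytes in place of the real video:

```python
import base64

# Placeholder bytes standing in for a local MP4 file's contents.
video_bytes = b'\x00\x00\x00\x18ftypmp42'
base64_encoded = base64.b64encode(video_bytes).decode('utf-8')
# data-URL form expected by the 'video' field above.
data_url = f'data:video/mp4;base64,{base64_encoded}'
print(data_url[:30])
```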

**Method 3: Swift Client**

```python
from swift.llm import InferRequest, InferClient, RequestConfig
from swift.plugin import InferStats

engine = InferClient(host='127.0.0.1', port=8000)
print(f'models: {engine.models}')
metric = InferStats()
request_config = RequestConfig(max_tokens=512, temperature=0)

# Using 3 infer_requests to demonstrate batch inference.
# Local paths, base64, and URLs are all supported.
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': 'Who are you?'}]),
    InferRequest(messages=[{'role': 'user', 'content': '<image><image> What is the difference between these two images?'}],
                 images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
                         'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']),
    InferRequest(messages=[{'role': 'user', 'content': '<video> Describe the video'}],
                 videos=['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']),
]

resp_list = engine.infer(infer_requests, request_config, metrics=[metric])
print(f'response0: {resp_list[0].choices[0].message.content}')
print(f'response1: {resp_list[1].choices[0].message.content}')
print(f'response2: {resp_list[2].choices[0].message.content}')
print(metric.compute())
metric.reset()

# Streaming, using base64-encoded video data
import base64
import requests

resp = requests.get('https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4')
base64_encoded = base64.b64encode(resp.content).decode('utf-8')
messages = [{'role': 'user', 'content': [
    {'type': 'video', 'video': f'data:video/mp4;base64,{base64_encoded}'},
    {'type': 'text', 'text': 'describe the video'}
]}]
infer_request = InferRequest(messages=messages)
request_config = RequestConfig(max_tokens=512, temperature=0, stream=True)
gen_list = engine.infer([infer_request], request_config, metrics=[metric])
print('response0: ', end='')
for chunk in gen_list[0]:
    if chunk is None:
        continue
    print(chunk.choices[0].delta.content, end='', flush=True)
print()
print(metric.compute())
```

We also provide more deployment demos:

- Multiple-LoRA deployment and invocation: refer to [this link](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/lora).
- Deployment and invocation of base models: refer to [this link](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/client/llm/base).
- More model types: we provide deployment scripts for [bert](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/bert) and [reward_model](https://github.com/modelscope/ms-swift/tree/main/examples/deploy/reward_model).

ms-swift/docs/source_en/Instruction/Megatron-SWIFT-Training.md
ADDED
|
@@ -0,0 +1,305 @@
# Megatron-SWIFT Training

SWIFT incorporates Megatron's parallelization techniques to accelerate the training of large models, including data parallelism, tensor parallelism, pipeline parallelism, sequence parallelism, context parallelism, and expert parallelism. It supports pre-training and fine-tuning of models such as Qwen3, [Qwen3-MoE](https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/qwen3_moe.sh), Qwen2.5, Llama3, and the Deepseek-R1 distillation series. For a complete list of supported models, please refer to the [Supported Models and Datasets documentation](./Supported-models-and-datasets.md).

## Environment Setup

To use Megatron-SWIFT, in addition to the `swift` dependencies, you also need to install the following:

```shell
# Recommended PyTorch version: 2.5 / 2.6
pip install pybind11

# transformer_engine
# If an installation error occurs, refer to this issue: https://github.com/modelscope/ms-swift/issues/3793
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable

# apex
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

# megatron-core
pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.12.0
```

Alternatively, you can use one of the prebuilt images:

```
modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1
modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py311-torch2.6.0-vllm0.8.3-modelscope1.25.0-swift3.3.0.post1
```
|
| 31 |
+
|
| 32 |
+
The training module in the dependent library Megatron-LM will be cloned and installed by swift via `git clone`. Alternatively, you can use the environment variable `MEGATRON_LM_PATH` to point to the path of an already downloaded repository (in offline environments, use the [core_r0.12.0 branch](https://github.com/NVIDIA/Megatron-LM/tree/core_r0.12.0)).

## Quick Start Example

This section introduces a quick start example for fine-tuning the self-awareness of the Qwen2.5-7B-Instruct model using two 80GiB A100 GPUs. The following best practices can be completed within 10 minutes.

First, we need to convert the weights from HF (Hugging Face) format to Megatron format:

```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --model Qwen/Qwen2.5-7B-Instruct \
    --to_mcore true \
    --torch_dtype bfloat16 \
    --output_dir Qwen2.5-7B-Instruct-mcore
```

Next, use the following script to start training. The required GPU memory resources are 2*80GiB:

```shell
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
    --load Qwen2.5-7B-Instruct-mcore \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
              'AI-ModelScope/alpaca-gpt4-data-en#500' \
              'swift/self-cognition#500' \
    --tensor_model_parallel_size 2 \
    --micro_batch_size 4 \
    --global_batch_size 16 \
    --recompute_granularity selective \
    --train_iters 100 \
    --eval_iters 5 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_iters 10 \
    --min_lr 1e-6 \
    --save megatron_output/Qwen2.5-7B-Instruct \
    --save_interval 100 \
    --max_length 2048 \
    --system 'You are a helpful assistant.' \
    --num_workers 4 \
    --no_save_optim true \
    --no_save_rng true \
    --dataset_num_proc 4 \
    --model_author swift \
    --model_name swift-robot
```

Finally, convert the Megatron format weights back to HF format:

```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --mcore_model megatron_output/Qwen2.5-7B-Instruct/vx-xxx \
    --to_hf true \
    --torch_dtype bfloat16 \
    --output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx-hf
```

We then perform inference on the generated HF format weights:

```shell
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --model megatron_output/Qwen2.5-7B-Instruct/vx-xxx-hf \
    --stream true \
    --temperature 0 \
    --max_new_tokens 2048
```

The inference results are as follows:

```
<<< who are you?
I am a language model developed by swift, you can call me swift-robot. How can I assist you?
```

- For pretraining, you can use `megatron pt` instead of `megatron sft`, which will use a generative template for training.
- **More examples**: packing, multi-node training, 32K context, MoE models, and pre-training examples can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).

## Benchmark

The speed comparison of full-parameter training for Dense/MoE models using `megatron sft` and `swift sft` on a single machine with eight A800 GPUs is shown below. The corresponding scripts can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron/benchmark).

**Dense** Qwen2.5-14B:

|                  | Megatron-LM | Deepspeed-ZeRO2 | Deepspeed-ZeRO3 |
| ---------------- | ----------- | --------------- | --------------- |
| Training Speed   | 9.04s/it    | 10.32s/it       | 10.56s/it       |
| GPU Memory Usage | 8\*64GB     | 8\*80GB         | 8\*58GB         |

**MoE** Qwen1.5-MoE-A2.7B:

|                  | Megatron-LM | Deepspeed-ZeRO2 | Deepspeed-ZeRO3 |
| ---------------- | ----------- | --------------- | --------------- |
| Training Speed   | 2.93s/it    | 6.02s/it        | 24.30s/it       |
| GPU Memory Usage | 8\*66GB     | 8\*72GB         | 8\*50GB         |

## Command Line Arguments

### Megatron Parameters

**Training Parameters**:

- 🔥micro_batch_size: Batch size per device, default is 1.
- 🔥global_batch_size: Total batch size, equivalent to `micro_batch_size * data parallel size * gradient accumulation steps`. Default is 16.
- 🔥recompute_granularity: Granularity of activation recomputation, options are 'full', 'selective'. 'full' means recomputing the entire transformer layer, while 'selective' means only recomputing the core attention part of the transformer layer. 'selective' is generally recommended. Default is 'selective'.
- 🔥recompute_method: This parameter takes effect only when recompute_granularity is set to 'full', options are 'uniform', 'block'. Default is None.
- 🔥recompute_num_layers: This parameter takes effect only when recompute_granularity is set to 'full'. If `recompute_method` is set to uniform, this parameter specifies the number of transformer layers in each uniformly divided recomputation unit. For example, you can specify `--recompute_granularity full --recompute_method uniform --recompute_num_layers 4`. The larger recompute_num_layers is, the smaller the memory usage but the higher the computation cost. Default is None.
- recompute_modules: Options include "core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", and "moe". The default value is `["core_attn"]`. For example, during MoE training, you can reduce memory usage by specifying `--recompute_granularity selective --recompute_modules core_attn moe`. Among these, "core_attn", "mlp", and "moe" use normal checkpointing, while "moe_act", "layernorm", and "mla_up_proj" use output-discarding checkpointing.
  - "core_attn": Recomputes the core attention part of the Transformer layer.
  - "mlp": Recomputes the dense MLP layer.
  - "moe": Recomputes the MoE layer.
  - "moe_act": Recomputes the MLP activation function part in the MoE module.
  - "layernorm": Recomputes the input_layernorm and pre_mlp_layernorm.
  - "mla_up_proj": Recomputes the MLA up-projection and RoPE application parts.
- deterministic_mode: Deterministic mode, which may lead to slower training speed, default is False.
- 🔥train_iters: Total number of training iterations, default is None.
- 🔥log_interval: Log interval (unit: iters), default is 5.
- tensorboard_dir: Directory where TensorBoard logs are written. Default is None, meaning logs will be stored in the `f'{save}/runs'` directory.
- no_masked_softmax_fusion: Default is False. Disables scaling, masking, and softmax fusion for query_key_value.
- no_bias_dropout_fusion: Default is False. Disables bias and dropout fusion.
- no_bias_swiglu_fusion: Default is False. Specify `--no_bias_swiglu_fusion true` to disable bias and swiglu fusion.
- no_rope_fusion: Default is False. Specify `--no_rope_fusion true` to disable rope fusion.
- no_gradient_accumulation_fusion: Default is False. Specify `--no_gradient_accumulation_fusion true` to disable gradient accumulation fusion.
- 🔥cross_entropy_loss_fusion: Enables cross-entropy loss calculation fusion. Default is False.
- calculate_per_token_loss: Scales the cross-entropy loss according to the number of non-padded tokens in the global batch. Default is True.
- 🔥attention_backend: The attention backend to use (flash, fused, unfused, local, auto). Defaults to auto.
- optimizer: Optimizer type, options are 'adam', 'sgd'. Default is adam.
- dataloader_type: Default is 'cyclic', options are 'single', 'cyclic', 'external'. If `--streaming` is enabled, set it to external.
- manual_gc: Disables the default garbage collector and manually triggers garbage collection. Default is False.
- manual_gc_interval: Interval at which garbage collection is triggered. Default is 0.
- seed: Random seed for python, numpy, pytorch, and cuda, default is 42.
- 🔥num_workers: Number of workers for the dataloader, default is 4.
  - Note: If `--streaming true` is set, it will be set to 1.
- seq_length: Defaults to None, meaning it is set to `max_length`. To restrict the dataset length, please use the `--max_length` parameter in the basic arguments; there is no need to set this parameter.
- use_cpu_initialization: Initializes weights on the CPU, default is False. Used during HF and MCore weight conversion.
- no_create_attention_mask_in_dataloader: Does not create an attention mask in the dataloader, default is True.
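
As a sanity check on the batch-size relation described by `global_batch_size`, the gradient accumulation steps can be derived from the other two factors (a minimal sketch; the helper name is ours, not an ms-swift API):

```python
def grad_accum_steps(global_batch_size: int, micro_batch_size: int, dp_size: int) -> int:
    """Derive gradient accumulation steps from the relation
    global_batch_size = micro_batch_size * data parallel size * gradient accumulation steps."""
    denom = micro_batch_size * dp_size
    if global_batch_size % denom != 0:
        raise ValueError("global_batch_size must be divisible by micro_batch_size * dp_size")
    return global_batch_size // denom

# Quick-start script: 2 GPUs with tensor_model_parallel_size=2 gives dp_size = 1.
print(grad_accum_steps(16, 4, 1))  # 4
```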

**Learning Rate Parameters**:

- 🔥lr: Initial learning rate, which will ultimately determine the learning rate for each iteration based on the warm-up and decay strategy, default is 1e-5.
- lr_decay_style: Learning rate decay strategy, default is 'cosine'. Commonly set to 'cosine', 'linear', or 'constant'.
- 🔥lr_decay_iters: Number of iterations for learning rate decay. Default is None, meaning it will be set to `--train_iters`.
- 🔥lr_warmup_iters: Number of iterations for linear learning rate warm-up, default is 0.
- 🔥min_lr: Minimum value of the learning rate, clipping any learning rate below this threshold to this value, default is 0.
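
The interaction of `lr`, `lr_warmup_iters`, `lr_decay_iters`, and `min_lr` under the default 'cosine' decay style can be illustrated with a small sketch (an illustration of the schedule shape, not Megatron's exact implementation):

```python
import math

def lr_at(it, lr=1e-5, warmup_iters=10, decay_iters=100, min_lr=1e-6):
    """Linear warm-up to `lr`, then cosine decay down to `min_lr`."""
    if it < warmup_iters:
        return lr * it / warmup_iters
    progress = min((it - warmup_iters) / max(decay_iters - warmup_iters, 1), 1.0)
    cosine = 0.5 * (1 + math.cos(math.pi * progress))
    return min_lr + (lr - min_lr) * cosine

print(lr_at(0))            # 0.0 (start of warm-up)
print(lr_at(100) == 1e-6)  # True (fully decayed to min_lr)
```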

**Regularization Parameters**:

- 🔥weight_decay: Default is 0.1.
- 🔥clip_grad: L2 gradient clipping, default is 1.0.
- adam_beta1: Default is 0.9.
- adam_beta2: Default is 0.95.
- adam_eps: Default is 1e-8.
- sgd_momentum: Default is 0.9.

**Checkpoint Parameters**:

- 🔥save: Output directory for checkpoints, default is None. During training, if this parameter is not set, it defaults to `f'megatron_output/{model_suffix}'`, e.g., `'megatron_output/Qwen2.5-7B-Instruct'`.
  - Note: When training on multiple machines, ensure that the save paths on each node point to the same location. Otherwise, you will need to manually consolidate these weights after training.
- 🔥save_interval: Checkpoint saving interval (steps), default is 500.
  - Note: Weights will always be saved at the end of training.
- 🔥no_save_optim: Do not save optimizer, default is False.
- 🔥no_save_rng: Do not save RNG, default is False.
- 🔥load: Directory of the checkpoint to load, default is None.
- 🔥no_load_optim: Do not load optimizer, default is False.
- 🔥no_load_rng: Do not load RNG, default is False.
- 🔥finetune: Load the model and fine-tune. Does not load the optimizer and random seed states from the checkpoint and resets the iteration count to 0. Default is False.
- ckpt_format: Format of the checkpoint. Options are 'torch', 'torch_dist', 'zarr'. Default is 'torch_dist'.
- no_initialization: Do not initialize weights, default is True.
- auto_detect_ckpt_format: Automatically detect whether the checkpoint format is legacy or distributed. Default is True.
- exit_on_missing_checkpoint: If `--load` is set but no checkpoint is found, exit directly instead of initializing. Default is True.

**Distributed Parameters**:

- distributed_backend: Distributed backend, options are 'nccl', 'gloo'. Default is nccl.
- 🔥use_distributed_optimizer: Use a distributed optimizer. Default is True.
- 🔥tensor_model_parallel_size: TP (Tensor Parallelism) size, default is 1.
- 🔥pipeline_model_parallel_size: PP (Pipeline Parallelism) size, default is 1.
- decoder_first_pipeline_num_layers: The number of Transformer layers in the first pipeline stage of the decoder. Default is None, which means the Transformer layers are evenly distributed across all pipeline stages.
- decoder_last_pipeline_num_layers: The number of Transformer layers in the last pipeline stage of the decoder. Default is None, which means the Transformer layers are evenly distributed across all pipeline stages.
- 🔥sequence_parallel: Enable sequence parallel optimization. Default is False.
- 🔥context_parallel_size: CP (Context Parallelism) size, default is 1.
- tp_comm_overlap: Overlap tensor parallel communication with GEMM (General Matrix Multiplication) kernels (to reduce communication time). Default is False.
- overlap_grad_reduce: Overlap grad reduction operations in DDP (to reduce DP communication time). Default is False.
- overlap_param_gather: Overlap all-gather of parameters in the distributed optimizer (to reduce DP communication time). Default is False.
- distributed_timeout_minutes: Timeout duration for torch.distributed (in minutes), default is 60 minutes.

**Logging Parameters**:

- log_params_norm: Logs the norm of parameters. Default is False.
- log_throughput: Logs throughput per GPU. Default is True.
  - Note: In non-packing scenarios, log_throughput is not accurate because `seq_length` does not equal the actual sequence length.
- tensorboard_log_interval: Interval (steps) for logging to TensorBoard, default is 1.
- tensorboard_queue_size: Queue length (related to disk I/O), similar to write intervals. Default is 50.
- log_timers_to_tensorboard: Logs timers to TensorBoard. Default is True.
- no_log_learning_rate_to_tensorboard: Do not log learning rate to TensorBoard. Default is False.
- log_validation_ppl_to_tensorboard: Writes validation perplexity to TensorBoard. Default is True.
- log_memory_to_tensorboard: Writes memory logs to TensorBoard. Default is True.
- logging_level: Logging level. Default is None.
- wandb_project: The name of the wandb project. Defaults to '', which means ignoring wandb.
- wandb_exp_name: The name of the wandb experiment. Defaults to ''.
- wandb_save_dir: The local path to save wandb results. Defaults to ''.

**Evaluation Parameters**:

- 🔥eval_iters: Number of evaluation iterations, default is 100.
- 🔥eval_interval: Evaluation interval (steps), default is None, meaning it will be set to save_interval.

**Mixed Precision Parameters**:

- fp16: FP16 mode. The default is None, and it will be set according to the model's torch_dtype. The torch_dtype is read from the config.json by default.
- bf16: BF16 mode. The default is None, and it will be set according to the model's torch_dtype.
- apply_query_key_layer_scaling: Scales `Q * K^T` by `1 / layer number` (e.g., divide by layer_num for the layer_num-th layer). This is helpful for FP16 training. Default is None, meaning that if `--fp16` is used, it will be set to True.
- attention_softmax_in_fp32: Uses FP32 for computations in attention_mask and softmax. Default is True.

**Model Parameters**: (The following parameters typically do not need to be set as they will be configured based on the HF model's config.json; users don't need to worry about them)

- num_layers: Number of transformer layers, default is None.
- hidden_size: Transformer hidden size, default is None.
- ffn_hidden_size: Hidden size of the FFN layer in the transformer. Default is None, set to `4*hidden_size`.
- num_attention_heads: Number of transformer attention heads, default is None.
- group_query_attention: Default is None. If `num_query_groups > 1`, group_query_attention is set to True, otherwise False.
- num_query_groups: Default is 1.
- max_position_embeddings: Maximum length of positional embeddings, default is None.
- position_embedding_type: Type of positional embedding, options are 'learned_absolute', 'rope', 'relative', and 'none'. Default is 'rope'.
- rotary_base: Default is 10000.
- rotary_percent: Default is 1.
- normalization: Options are 'LayerNorm', 'RMSNorm'. Default is RMSNorm.
- norm_epsilon: Default is 1e-5.
- swiglu: Uses swiglu instead of the default gelu. Default is True.
- untie_embeddings_and_output_weights: Unties embedding and output weights. Default is True.
- disable_bias_linear: Disables bias in linear layers. Default is True.
- add_qkv_bias: Adds bias only to QKV linear layers. Default is True.
- attention_dropout: Default is 0.
- hidden_dropout: Default is 0.
- kv_channels: Defaults to None, set to `args.hidden_size // args.num_attention_heads`.
- qk_layernorm: Whether to apply layer normalization to Q and K.
- transformer_impl: Which transformer implementation to use, options are 'local' and 'transformer_engine'. Default is transformer_engine.
- padded_vocab_size: Full vocabulary size, default is None.
- rope_scaling: Related parameters for rope_scaling, default is None. Refer to the format in [llama3.1 config.json](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-8B-Instruct/file/view/master?fileName=config.json&status=1). Pass the value as a JSON string.
- model_type: The model_type in the config.json of the Huggingface model weights.

**MoE Parameters**:

- num_experts: The number of experts in MoE, default is None. Automatically read from config.json.
- moe_ffn_hidden_size: The hidden layer size of the feed-forward network (ffn) for each expert. Default is None, set to ffn_hidden_size. Automatically read from config.json.
- moe_shared_expert_intermediate_size: The total FFN hidden layer size for shared experts. If there are multiple shared experts, it should equal `num_shared_experts * ffn_size_of_each_shared_expert`. Default is None. Automatically read from config.json.
- moe_router_topk: The number of experts each token is routed to. Default is None. Automatically read from config.json.
- moe_router_pre_softmax: Enable pre-softmax routing for MoE, meaning that softmax will be applied before top-k selection. Default is None. Automatically read from config.json.
- 🔥moe_aux_loss_coeff: Scaling coefficient for the auxiliary loss; the recommended initial value is 1e-2. Default is None. Automatically read from config.json.
- 🔥expert_model_parallel_size: The degree of expert parallelism, default is 1.
- moe_token_dispatcher_type: The type of token dispatcher to use. Options include 'allgather', 'alltoall', and 'alltoall_seq'. Default is 'alltoall'.
- moe_grouped_gemm: When each rank contains multiple experts, improve utilization and performance by launching multiple local GEMM kernels across multiple streams using GroupedLinear in TransformerEngine. Default is False.
- moe_router_load_balancing_type: Determines the load balancing strategy for the router. Options are "aux_loss", "seq_aux_loss", "sinkhorn", "none". Default is "aux_loss".
- moe_z_loss_coeff: Scaling coefficient for z-loss. Default is None.
- moe_expert_capacity_factor: Capacity factor for each expert, None means no tokens will be dropped. Default is None.
- moe_shared_expert_overlap: Enable overlapping of shared expert computation with scheduler communication. If this option is not enabled, shared experts will execute after the routing experts. Only effective when `moe_shared_expert_intermediate_size` is set. Default is False.
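
For intuition on `moe_aux_loss_coeff` and the "aux_loss" balancing strategy: the commonly used load-balancing auxiliary loss multiplies, per expert, the fraction of token-to-expert assignments it receives by its mean router probability. A hedged sketch of this standard formulation (not the exact Megatron-Core code):

```python
def aux_loss(router_probs, topk=2, coeff=1e-2):
    """Load-balancing auxiliary loss: coeff * num_experts * sum_i f_i * P_i,
    where f_i is the fraction of token->expert assignments routed to expert i
    and P_i is the mean router probability of expert i."""
    num_tokens = len(router_probs)
    num_experts = len(router_probs[0])
    counts = [0] * num_experts
    for probs in router_probs:
        for e in sorted(range(num_experts), key=lambda i: probs[i])[-topk:]:
            counts[e] += 1
    f = [c / (num_tokens * topk) for c in counts]                                  # dispatch fractions
    p = [sum(row[i] for row in router_probs) / num_tokens for i in range(num_experts)]  # mean probs
    return coeff * num_experts * sum(fi * pi for fi, pi in zip(f, p))

uniform = [[0.25] * 4 for _ in range(8)]  # perfectly balanced router, 8 tokens, 4 experts
print(aux_loss(uniform))  # 0.01
```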

### Megatron Training Parameters

Megatron training parameters inherit from Megatron parameters and basic parameters. For information on basic parameters, see [here](./Command-line-parameters.md#base-arguments). Additionally, the following parameters are included:

- add_version: Adds a directory `<version>-<timestamp>` to `save` to prevent overwriting weights, default is True.
- 🔥packing: Whether to use sequence packing, defaults to False.
- 🔥streaming: Stream reading and processing of the dataset, default is False. It is typically set to True when handling large datasets. For more information on streaming parameters, refer to the command-line parameters documentation.
- lazy_tokenize: Default is False. If this parameter is set to False, all dataset samples are tokenized before training (this avoids errors during training); if set to True, tokenization occurs during training (this saves memory).
- dataloader_persistent_workers: A parameter passed directly to the dataloader, with a default value of True.
- dataloader_prefetch_factor: A parameter passed directly to the dataloader, with a default value of 10.
- max_epochs: Forces the training to exit after reaching `max_epochs`, and performs validation and saving of the model weights. This parameter is especially useful when using a streaming dataset. Default is None.

ms-swift/docs/source_en/Instruction/RLHF.md ADDED

# RLHF

This document provides training scripts for various human preference alignment algorithms. If you want to learn more about the algorithms and how to choose them, please refer to the [documentation](https://github.com/modelscope/modelscope-classroom/blob/main/LLM-tutorial/M.%E4%BA%BA%E7%B1%BB%E5%81%8F%E5%A5%BD%E5%AF%B9%E9%BD%90%E8%AE%AD%E7%BB%83.md).

## Dataset

The data required by the PPO and GRPO algorithms consists solely of model inputs, which include the system prompt (optional) and the query. For the GRPO algorithm, the reward function may require additional data columns; for example, to calculate accuracy, a `solution` column is needed as a reference answer.

For RM and DPO-type algorithms such as ORPO, CPO, and SimPO, $(x,y_w,y_l)$ formatted data is required, where $x$ is the model input, $y_w$ is the preferred answer that aligns with human preferences, and $y_l$ is the rejected answer that does not align with human preferences.

In contrast, the KTO algorithm has a special data format that only requires $(x,y,\text{label})$, where $x$ is the model input, $y$ is the model output, and the label indicates whether the answer aligns with human preferences.

For RLHF training of text models or multimodal large models using a custom dataset, you can refer to the [custom dataset documentation](../Customization/Custom-dataset.md#rlhf).
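
As an illustration of the $(x, y_w, y_l)$ preference format, one JSONL row might look like the following (a hypothetical layout with made-up field names; consult the custom dataset documentation linked above for the exact schema ms-swift expects):

```python
import json

# One hypothetical preference sample: input x, chosen y_w, rejected y_l.
sample = {
    "messages": [{"role": "user", "content": "What is 2 + 2?"}],  # x
    "chosen": "2 + 2 equals 4.",                                  # y_w: preferred answer
    "rejected": "2 + 2 equals 5.",                                # y_l: rejected answer
}
line = json.dumps(sample, ensure_ascii=False)  # one line of a .jsonl file
print(json.loads(line)["chosen"])  # 2 + 2 equals 4.
```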

## GRPO
[Paper on arXiv](https://arxiv.org/abs/2402.03300)

Reference the training script [here](./GRPO.md).

## DPO
[Paper on arXiv](https://arxiv.org/abs/2305.18290)

Hyperparameters:

- beta: KL regularization coefficient. A larger value imposes a stronger penalty for deviation from the reference model. Default is 0.1.

It is recommended to perform SFT training on the preferred answers from the preference dataset before starting DPO training, to ensure the data meets the distribution requirements of the DPO algorithm.
We also mix SFT loss into the DPO loss to stabilize training; you can adjust the coefficient of the SFT loss with the hyperparameter `rpo_alpha`, which defaults to `1.`.
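
The resulting objective is the standard DPO loss plus `rpo_alpha` times an SFT (NLL) loss on the preferred answer. A minimal sketch of the per-sample computation using summed log-probabilities (this illustrates the formula rather than reproducing the trainer's exact code):

```python
import math

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1, rpo_alpha=1.0):
    """DPO loss -log(sigmoid(beta * margin)) plus rpo_alpha * SFT NLL on y_w.
    Each argument is the summed log-probability of a full response."""
    margin = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    dpo = -math.log(1.0 / (1.0 + math.exp(-beta * margin)))
    sft_nll = -logp_w  # NLL of the preferred answer
    return dpo + rpo_alpha * sft_nll

# When policy == reference, the margin is 0 and the DPO term is log(2).
print(round(dpo_loss(-5.0, -5.0, -5.0, -5.0, rpo_alpha=0.0), 4))  # 0.6931
```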

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/dpo.sh).

## RM
[Paper on arXiv](https://arxiv.org/abs/2203.02155)

Reward Modeling stage in RLHF.

Use the base model or instruct model trained with SFT as the foundation model. Add a value head and train it using the preference dataset to create the reward model.

The weights of the added value head will be saved in `value_head.safetensors` or `value_head.bin`.

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/rm.sh).

## PPO
[Paper on arXiv](https://arxiv.org/abs/2203.02155)

The PPO (Proximal Policy Optimization) stage in RLHF involves four models:

- model: The training model, either the base model or the instruct model trained with SFT.
- ref_model: The reference model, which defaults to the training model.
- reward_model: The reward model obtained from the RM stage.
- value_model: The value model, initialized from the reward model and updated synchronously during training.

Hyperparameters:

- local_rollout_forward_batch_size: Batch size for each data sample, default is 64.
- whiten_rewards: Normalize rewards, default is False.
- kl_coef: Coefficient for the KL divergence term, default is 0.05.
- cliprange: Clip range in the PPO policy loss function, default is 0.2.
- vf_coef: Coefficient for the value loss function, default is 0.1.
- cliprange_value: Clip range in the PPO value loss function, default is 0.2.
- gamma: Discount factor for cumulative rewards, default is 1.0.
- lam: Lambda coefficient in [GAE](https://arxiv.org/abs/1506.02438), default is 0.95.
- num_sample_generations: Number of debugging samples generated during training, default is 10.
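
`gamma` and `lam` enter through Generalized Advantage Estimation. A minimal sketch of the backward GAE recursion over a finished episode (illustrative, not the trainer's exact implementation):

```python
def gae_advantages(rewards, values, gamma=1.0, lam=0.95):
    """Generalized Advantage Estimation: A_t = delta_t + gamma*lam*A_{t+1},
    with delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), and V = 0 past the end."""
    advantages = [0.0] * len(rewards)
    next_adv, next_value = 0.0, 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_value - values[t]
        next_adv = delta + gamma * lam * next_adv
        advantages[t] = next_adv
        next_value = values[t]
    return advantages

# With lam=0 the advantage reduces to the one-step TD error.
print(gae_advantages([1.0, 1.0], [0.5, 0.5], gamma=1.0, lam=0.0))  # [1.0, 0.5]
```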

Note: When training the base model, perform SFT first and then proceed to RLHF. Specify the chat template, and it is recommended to use `full` for sft_type.

Refer to the [documentation](https://huggingface.co/docs/trl/ppov2_trainer#explanation-of-the-logged-metrics) for explanations of the metrics logged during training.

## KTO
[Paper on arXiv](https://arxiv.org/abs/2402.01306)

Hyperparameters:

- beta: KL regularization coefficient. A larger value leads to a greater penalty for deviation from the reference model. Default is 0.1.
- desirable_weight: The $\lambda_D$ term in the loss function, i.e., the loss weight for preferred response samples, with a default value of 1.0.
- undesirable_weight: The $\lambda_U$ term in the loss function, i.e., the loss weight for rejected samples, with a default value of 1.0.

Let $n_D$ and $n_U$ represent the number of preferred and rejected samples in the dataset, respectively. For hyperparameters $\lambda_D$ and $\lambda_U$, the authors recommend setting $\frac{\lambda_D n_D}{\lambda_U n_U} \in [1, \frac{4}{3}]$.
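
This recommendation is easy to check programmatically when picking the two weights for an imbalanced dataset (the helper name is ours, not an ms-swift API):

```python
def kto_ratio_ok(lambda_d, lambda_u, n_d, n_u):
    """Check the KTO authors' recommendation:
    lambda_D * n_D / (lambda_U * n_U) should lie in [1, 4/3]."""
    ratio = (lambda_d * n_d) / (lambda_u * n_u)
    return 1.0 <= ratio <= 4.0 / 3.0

# Twice as many rejected samples: weight preferred samples 2x to rebalance.
print(kto_ratio_ok(1.0, 1.0, 500, 1000))  # False
print(kto_ratio_ok(2.0, 1.0, 500, 1000))  # True
```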

Training script: train using data in the $(x,y,\text{label})$ format.

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/kto.sh).

## CPO
[Paper on arXiv](https://arxiv.org/abs/2401.08417)

Hyperparameters:

- beta: Coefficient before the implicit reward, default is 0.1.
- cpo_alpha: Coefficient for the NLL loss, default is 1.0.

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/cpo.sh).

## ORPO
[Paper on arXiv](https://arxiv.org/abs/2403.07691)

Hyperparameters:

- lambda: Odds Ratio loss coefficient.

Note: ORPO uses the parameter `--beta` to pass the hyperparameter `lambda`.

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/orpo.sh).

## SimPO
[Paper on arXiv](https://arxiv.org/abs/2405.14734)

Hyperparameters:

- beta: Coefficient before the implicit reward, default is 2.0.
- simpo_gamma: Reward margin term, default is 1.0.
- cpo_alpha: Coefficient for the CPO NLL loss mixed in to improve training stability; defaults to 1.0. Set it to 0.0 to use the original SimPO algorithm.

Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/simpo.sh).

ms-swift/docs/source_en/Instruction/Reinforced-Fine-tuning.md ADDED
| 1 |
+
# Reinforced Fine-Tuning
|
| 2 |
+
|
| 3 |
+
Reinforced fine-tuning is one of the most important functionalities in current model training, with various implementations. SWIFT has already supported the atomic capabilities required for reinforced fine-tuning, such as sampling, reinforcement learning, and fine-tuning. Currently, we provide a specific example of rejection sampling fine-tuning, which can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py).
|
| 4 |
+
|
| 5 |
+
## Concept of Reinforced Fine-Tuning
|
| 6 |
+
|
| 7 |
+
The concept of reinforced fine-tuning has been proposed since 2022 (or even earlier). Its general workflow typically includes the following steps:
|
| 8 |
+
|
| 9 |
+
1. Generate data using a specific model or augment the original dataset.
|
| 10 |
+
2. Train the target model using the generated data.
|
| 11 |
+
3. Repeat the above process if necessary.
|
| 12 |
+
|
| 13 |
+
**Step 1:**
|
| 14 |
+
|
| 15 |
+
- If the data-generating model is a larger model, such as GPT, Qwen-Max, DeepSeek-V3/R1, etc., this process can be understood as distillation.
|
| 16 |
+
- If the data-generating model is the same model being trained, this can be considered self-improvement fine-tuning.
|
| 17 |
+
- If the sampling process involves sampling a batch, fitting the data with KL divergence and rewards, and iterating continuously, it can be classified as on-policy algorithms like PPO or GRPO.
|
| 18 |
+
- Sampling algorithms include Monte Carlo sampling, do_sample, group beam search, DVTS, etc.
|
| 19 |
+
- The sampling process can incorporate ORM (Outcome Reward Model), PRM (Process Reward Model), diversity filtering, language filtering, etc.
|
| 20 |
+
|
| 21 |
+
**Step 2:**
|
| 22 |
+
|
| 23 |
+
- If SFT (Supervised Fine-Tuning) is used, it is referred to as rejection sampling fine-tuning.
|
| 24 |
+
- If reinforcement learning is used, it is called reinforcement learning fine-tuning.
|
| 25 |
+
|
| 26 |
+
**Step 3:**
|
| 27 |
+
|
| 28 |
+
- If distillation is performed using a larger model (e.g., Monte Carlo sampling distillation with a larger model), the process usually does not involve iterations.
|
| 29 |
+
- If the same model is used for sampling or algorithms like PPO are applied, iterations are typically included.
|
| 30 |
+
|
| 31 |
+
In general, the common approaches to reinforced fine-tuning include:
|
| 32 |
+
|
| 33 |
+
1. **Distillation**: Sampling high-quality data in bulk from a larger model using methods like Monte Carlo or do_sample, and training a smaller model on this data.
|
| 34 |
+
2. **Self-improvement**: Sampling a portion of high-quality data from the same model, filtering it, and training the model iteratively.
|
| 35 |
+
3. **On-policy RL**: Using methods like PPO or GRPO for iterative training.
|
| 36 |
+
|
| 37 |
+
The sampling process is usually much more time-consuming than the training process. If data is distilled using GPT or other large models, token costs must be considered. Thus, reinforced fine-tuning is generally a supplementary mechanism for fine-tuning, except for special cases like DeepSeek-R1.
|
| 38 |
+
|
| 39 |
+
DeepSeek-R1 uses the GRPO algorithm to enable the emergence of CoT (Chain-of-Thought) capabilities from scratch in a base model. This method requires large-scale cluster support and sufficiently large models for capability emergence. This is not discussed in detail here, but more information can be found in the [paper analysis](https://zhuanlan.zhihu.com/p/19714987272).
|
| 40 |
+
|
| 41 |
+
Some related papers on reinforced fine-tuning:
|
| 42 |
+
|
| 43 |
+
- Rejection Sampling Fine-Tuning: https://arxiv.org/pdf/2308.01825
|
| 44 |
+
- ReST: https://arxiv.org/pdf/2308.08998
|
| 45 |
+
- B-STAR: https://arxiv.org/pdf/2412.17256
|
| 46 |
+
- DeepSeekMath: https://arxiv.org/pdf/2402.03300
|
| 47 |
+
- Qwen-Math-PRM: https://arxiv.org/pdf/2501.07301
|
| 48 |
+
- DeepSeek-R1: https://github.com/deepseek-ai/DeepSeek-R1/tree/main
|
| 49 |
+
|
| 50 |
+
## When to Use Reinforced Fine-Tuning
|
| 51 |
+
|
| 52 |
+
Since LLaMA3, we have observed a very noticeable yet rarely mentioned phenomenon: when training an Instruct model using a CoT-enabled training dataset and evaluating it on the corresponding test set, the test set performance tends to degrade. For example, training `llama3.1-8b-instruct` on the GSM8K training set and evaluating the generated checkpoint on the test set reveals performance degradation.
|
| 53 |
+
|
| 54 |
+
This phenomenon mainly arises from the issue of knowledge forgetting disaster in models. During fine-tuning by model manufacturers, a significant amount of CoT data is often included. When solving mathematical tasks, the model's capability often originates not from the math dataset itself but potentially from datasets like ARC. This inference is supported by [some works](https://zhuanlan.zhihu.com/p/19269451950). Continued training on general tasks disrupts the model's existing capabilities, leading to performance degradation.
|
| 55 |
+
|
| 56 |
+
However, it is always correct to prioritize fine-tuning. Fine-tuning allows the model to quickly adapt to the dataset distribution at a low cost. Reinforced fine-tuning should be used under the following conditions:
|
| 57 |
+
|
| 58 |
+
1. The model has already been fine-tuned but does not meet the requirements.
|
| 59 |
+
2. Stronger CoT capabilities are needed.
|
| 60 |
+
3. Base model training for general capabilities is necessary, and the original dataset no longer improves performance.
|
| 61 |
+
4. The output results for corresponding queries can be relatively accurately evaluated, such as tasks with clear results (math, code) or clear processes (translation, style fitting).
|
| 62 |
+
|
| 63 |
+
Reinforced fine-tuning heavily depends on the accuracy of reward evaluations. If the evaluations are inaccurate, the training may oscillate without progress or even degrade the model performance.
|
| 64 |
+
|
| 65 |
+
## SWIFT Implementation
|
| 66 |
+
|
| 67 |
+
SWIFT supports the `sample` command, which is used for model sampling. Currently supported sampling methods include:
|
| 68 |
+
|
| 69 |
+
- **do_sample**: A sampling method for open-source models; future updates will include support for model distillation.
|
| 70 |
+
- URL sampling will also be supported in the future for large-model distillation.
|
| 71 |
+
|
| 72 |
+
- **mcts**: Monte Carlo sampling, currently under review, with future support planned.
|
| 73 |
+
- **dvts**: Currently under investigation.
|
| 74 |
+
|
| 75 |
+
We have provided a general [RFT script](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py). This script supports self-improvement training and allows dynamic adjustments of sampling temperature, PRM thresholds, and other hyperparameters. The training method is flexible (e.g., fine-tuning, DPO) and supports iterative retraining of the original model or continued training from the previous iteration, even loading all training states from the previous iteration. Developers can incorporate additional data filtering (e.g., ensuring rows with the same ID come from the same query), including diversity checks, language filtering, etc.
|
| 76 |
+
|
| 77 |
+
## Experimental Results
|
| 78 |
+
|
| 79 |
+
We used the RFT script to train and evaluate the `competition_math` dataset in the math domain. The results are as follows:
|
| 80 |
+
|
| 81 |
+
| Model | MATH Score | Training Method | Iterations | Post-Training MATH Score |
|
| 82 |
+
|----------------------------|------------|-----------------|------------|---------------------------|
|
| 83 |
+
| LLaMA3.1_8b | 12.0 | SFT | 3 | 25.2 (LLaMA3.1_8b_sft) |
|
| 84 |
+
| LLaMA3.1_8b_sft | 25.2 | RFT | 2 | 32.4 |
|
| 85 |
+
| LLaMA3.1_8b_instruct | 52.2 | SFT | 2 | 39.0 |
|
| 86 |
+
| LLaMA3.1_8b_instruct | 52.2 | RFT | 3 | 58 |
|
| 87 |
+
| Qwen2.5_math_7b_instruct | 79.6 | RFT | 2 | 83.2 |
|
| 88 |
+
|
| 89 |
+
As shown, applying SFT to the `competition_math` dataset resulted in significant performance degradation for the instruct model. However, RFT improved the model's capabilities, even for the state-of-the-art `Qwen2.5_math_7b_instruct` math model.
|
| 90 |
+
|
| 91 |
+
Specifically, we tested the GSM8K metric for `Qwen2.5_math_7b_instruct`:
|
| 92 |
+
|
| 93 |
+
| Model | GSM8K Score | Post-RFT GSM8K Score |
|
| 94 |
+
|----------------------------|-------------|-----------------------|
|
| 95 |
+
| Qwen2.5_math_7b_instruct | 92.8 | 91.6 |
|
| 96 |
+
|
| 97 |
+
As shown, RFT training did not significantly change the GSM8K score, avoiding the previously mentioned performance degradation phenomenon.
|
| 98 |
+
|
| 99 |
+
## Future Roadmap
|
| 100 |
+
|
| 101 |
+
1. More sampling methods,MCTS for example
|
| 102 |
+
2. Distill from super huge model
|
| 103 |
+
3. On policy RFT like PPO
|
ms-swift/docs/source_en/Instruction/Sample.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sampling
|
| 2 |
+
|
| 3 |
+
Sampling is one of the newly supported key capabilities of SWIFT. This feature can be understood as the practical implementation of `test-time compute`. Additionally, this capability is crucial for the implementation of RFT (Reinforcement Fine-Tuning).
|
| 4 |
+
|
| 5 |
+
## Capability Introduction
|
| 6 |
+
|
| 7 |
+
The sampling capability of SWIFT can be demonstrated with the following example:
|
| 8 |
+
|
| 9 |
+
```shell
|
| 10 |
+
swift sample --model LLM-Research/Meta-Llama-3.1-8B-Instruct --sampler_engine pt --num_return_sequences 5 --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
A `jsonl` file with a timestamp as the filename will be generated in the `sample_output` directory of the current folder. This file should contain 25 lines, each representing a complete `messages` format data.
|
| 14 |
+
|
| 15 |
+
For a list of sampling parameters, please refer to [here](Command-line-parameters.md).
|
| 16 |
+
|
| 17 |
+
## Environment Setup
|
| 18 |
+
|
| 19 |
+
```shell
|
| 20 |
+
pip install ms-swift[llm] -U
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
Or install swift from source:
|
| 24 |
+
|
| 25 |
+
```shell
|
| 26 |
+
git clone https://github.com/modelscope/ms-swift.git
|
| 27 |
+
cd ms-swift
|
| 28 |
+
pip install -e '.[llm]'
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## Using PRM and ORM for Result Filtering
|
| 32 |
+
|
| 33 |
+
An important capability of sampling is supervising the process and results, which can be supported by setting additional parameters.
|
| 34 |
+
|
| 35 |
+
```shell
|
| 36 |
+
swift sample --model LLM-Research/Meta-Llama-3.1-8B-Instruct --sampler_engine lmdeploy --num_return_sequences 5 --n_best_to_keep 2 --dataset tastelikefeet/competition_math#5 --prm_model AI-ModelScope/GRM-llama3.2-3B-rewardmodel-ft --orm_model math
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
A `jsonl` file with a timestamp as the filename will be generated in the `sample_output` directory of the current folder. This file **will contain at most** 10 lines, each representing a complete `messages` format data.
|
| 40 |
+
> The reason it contains at most 10 lines is that although 5 data points are processed in total, and 2 are kept for each data point (`n_best_to_keep`), ORM may fail some validations, and failed data will not be retained in the file.
|
| 41 |
+
> Additionally, after adding `--prm_model` or `--orm_model`, the file format is slightly different and includes a `rejected_response` key, which contains the responses with the lowest PRM scores.
|
| 42 |
+
|
| 43 |
+
## Customizing PRM or ORM
|
| 44 |
+
|
| 45 |
+
PRM and ORM can be customized by adding a new implementation in the plugin according to the existing code. For example:
|
| 46 |
+
|
| 47 |
+
```python
|
| 48 |
+
class CustomPRM:
|
| 49 |
+
|
| 50 |
+
# The constructor should be parameterless
|
| 51 |
+
def __init__(self):
|
| 52 |
+
# Initialize here
|
| 53 |
+
pass
|
| 54 |
+
|
| 55 |
+
def __call__(self, infer_requests: List[InferRequest], ground_truths: List[str], **kwargs) -> List[Union[float, List[float]]]:
|
| 56 |
+
...
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
prms = {'custom': CustomPRM}
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
Afterward, use `--prm_model custom` in the command line.
|
| 63 |
+
|
| 64 |
+
## Memory Control
|
| 65 |
+
|
| 66 |
+
If the sampled model and PRM are loaded into memory simultaneously, it may lead to an OOM (Out of Memory) issue. To address this, sampling can be divided into two stages:
|
| 67 |
+
|
| 68 |
+
- **Stage 1**: Specify `--model` and `--sampler_engine` without specifying `--orm_model` and `--prm_model`. Perform sampling only and save the results to a file.
|
| 69 |
+
- **Stage 2**: Specify `--sampler_engine no`, along with `--orm_model` and `--prm_model`, and also specify `--cache_files`. Perform only RM data filtering without re-sampling.
|
| 70 |
+
|
| 71 |
+
By dividing the process into two stages, only one model is loaded at a time, avoiding OOM issues.
|
| 72 |
+
|
| 73 |
+
## Practical Example
|
| 74 |
+
|
| 75 |
+
Please refer to the [Reinforcement Fine-Tuning Script](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft/rft.py). This script provides a practical example of using sampling for reinforcement fine-tuning.
|
| 76 |
+
|
| 77 |
+
> **Note:** The actual effectiveness of this script is strongly related to the quality of the model, data, and RM. Therefore, it is presented only as an example. Users should modify this script and train their own RM and generator models accordingly.
|
| 78 |
+
|
| 79 |
+
## Sampling From Large Model
|
| 80 |
+
|
| 81 |
+
SWIFT's sample supports using the OpenAI API to distill data with large models. Example:
|
| 82 |
+
|
| 83 |
+
```shell
|
| 84 |
+
OPENAI_API_KEY="your_api_key" \
|
| 85 |
+
swift sample \
|
| 86 |
+
--sampler_type distill \
|
| 87 |
+
--sampler_engine client \
|
| 88 |
+
--model deepseek-r1 \
|
| 89 |
+
--stream true \
|
| 90 |
+
--dataset tastelikefeet/competition_math#5 \
|
| 91 |
+
--num_return_sequences 1 \
|
| 92 |
+
--temperature 0.6 \
|
| 93 |
+
--top_p 0.95 \
|
| 94 |
+
--engine_kwargs '{"base_url":"https://dashscope.aliyuncs.com/compatible-mode/v1"}'
|
| 95 |
+
```
|
| 96 |
+
In this example:
|
| 97 |
+
|
| 98 |
+
`base_url` and `model` represent the API endpoint and model name, respectively. `stream` indicates the stream parameter for the request.
|
| 99 |
+
|
| 100 |
+
Note: For Deepseek-R1 series models, the output will be formatted as:`<thinking>{reasoning_content}</thinking>\n\n<answer>{content}</answer>`.
|
ms-swift/docs/source_en/Instruction/Supported-models-and-datasets.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ms-swift/docs/source_en/_templates/autosummary/class.rst
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. currentmodule:: {{ module }}
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
{{ name | underline}}
|
| 5 |
+
|
| 6 |
+
.. autoclass:: {{ name }}
|
| 7 |
+
:inherited-members:
|
| 8 |
+
:members:
|
| 9 |
+
|
| 10 |
+
.. autogenerated from source/_templates/autosummary/class.rst
|
ms-swift/docs/source_en/_templates/sobolengine.rst
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. currentmodule:: {{ module }}
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
{{ name | underline}}
|
| 5 |
+
|
| 6 |
+
.. autoclass:: {{ name }}
|
| 7 |
+
:members:
|
| 8 |
+
:exclude-members: MAXBIT, MAXDIM
|
| 9 |
+
:undoc-members:
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
..
|
| 13 |
+
autogenerated from source/_templates/sobolengine.rst
|
| 14 |
+
note it has specific options
|
ms-swift/examples/README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Instructions
|
| 2 |
+
|
| 3 |
+
The example provides instructions for using SWIFT for training, inference, deployment, evaluation, and quantization. By default, the model will be downloaded from the ModelScope community.
|
| 4 |
+
|
| 5 |
+
If you want to use the Huggingface community, you can change the command line like this:
|
| 6 |
+
|
| 7 |
+
```shell
|
| 8 |
+
...
|
| 9 |
+
swift sft \
|
| 10 |
+
--model <model_id_or_path> \
|
| 11 |
+
--use_hf 1 \
|
| 12 |
+
...
|
| 13 |
+
```
|
ms-swift/examples/app/base_url/demo.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
| 5 |
+
|
| 6 |
+
if __name__ == '__main__':
|
| 7 |
+
from swift.llm import AppArguments, app_main, DeployArguments, run_deploy
|
| 8 |
+
# Here's a runnable demo provided.
|
| 9 |
+
# In a real scenario, you can simply remove the deployed context.
|
| 10 |
+
with run_deploy(
|
| 11 |
+
DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'),
|
| 12 |
+
return_url=True) as url:
|
| 13 |
+
app_main(AppArguments(model='Qwen2.5-1.5B-Instruct', base_url=url, stream=True, max_new_tokens=2048))
|
ms-swift/examples/app/base_url/demo.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# You need to have a deployed model or api service first
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 swift app \
|
| 3 |
+
--model '<model_name>' \
|
| 4 |
+
--base_url http://127.0.0.1:8000/v1 \
|
| 5 |
+
--stream true \
|
| 6 |
+
--max_new_tokens 2048 \
|
| 7 |
+
--lang zh
|
ms-swift/examples/custom/dataset.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from typing import Any, Dict, Optional
|
| 3 |
+
|
| 4 |
+
from swift.llm import DatasetMeta, ResponsePreprocessor, load_dataset, register_dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CustomPreprocessor(ResponsePreprocessor):
|
| 8 |
+
prompt = """Task: Based on the given two sentences, provide a similarity score between 0.0 and 5.0.
|
| 9 |
+
Sentence 1: {text1}
|
| 10 |
+
Sentence 2: {text2}
|
| 11 |
+
Similarity score: """
|
| 12 |
+
|
| 13 |
+
def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 14 |
+
return super().preprocess({
|
| 15 |
+
'query': self.prompt.format(text1=row['text1'], text2=row['text2']),
|
| 16 |
+
'response': f"{row['label']:.1f}"
|
| 17 |
+
})
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
register_dataset(
|
| 21 |
+
DatasetMeta(
|
| 22 |
+
ms_dataset_id='swift/stsb',
|
| 23 |
+
hf_dataset_id='SetFit/stsb',
|
| 24 |
+
preprocess_func=CustomPreprocessor(),
|
| 25 |
+
))
|
| 26 |
+
|
| 27 |
+
if __name__ == '__main__':
|
| 28 |
+
dataset = load_dataset(['swift/stsb'])[0]
|
| 29 |
+
print(f'dataset: {dataset}')
|
| 30 |
+
print(f'dataset[0]: {dataset[0]}')
|
ms-swift/examples/custom/infer.sh
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sh examples/custom/infer.sh
|
| 2 |
+
CUDA_VISIBLE_DEVICES=0 \
|
| 3 |
+
swift infer \
|
| 4 |
+
--adapters output/vx-xxx/checkpoint-xxx \
|
| 5 |
+
--load_data_args true \
|
| 6 |
+
--infer_backend pt \
|
| 7 |
+
--max_batch_size 16 \
|
| 8 |
+
--max_new_tokens 256 \
|
| 9 |
+
--temperature 0
|
ms-swift/examples/custom/model.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import (InferRequest, Model, ModelGroup, ModelMeta, PtEngine, RequestConfig, TemplateMeta,
|
| 3 |
+
get_model_tokenizer_with_flash_attn, register_model, register_template)
|
| 4 |
+
|
| 5 |
+
register_template(
|
| 6 |
+
TemplateMeta(
|
| 7 |
+
template_type='custom',
|
| 8 |
+
prefix=['<extra_id_0>System\n{{SYSTEM}}\n'],
|
| 9 |
+
prompt=['<extra_id_1>User\n{{QUERY}}\n<extra_id_1>Assistant\n'],
|
| 10 |
+
chat_sep=['\n']))
|
| 11 |
+
|
| 12 |
+
register_model(
|
| 13 |
+
ModelMeta(
|
| 14 |
+
model_type='custom',
|
| 15 |
+
model_groups=[
|
| 16 |
+
ModelGroup([Model('AI-ModelScope/Nemotron-Mini-4B-Instruct', 'nvidia/Nemotron-Mini-4B-Instruct')])
|
| 17 |
+
],
|
| 18 |
+
template='custom',
|
| 19 |
+
get_function=get_model_tokenizer_with_flash_attn,
|
| 20 |
+
ignore_patterns=['nemo']))
|
| 21 |
+
|
| 22 |
+
if __name__ == '__main__':
|
| 23 |
+
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
|
| 24 |
+
request_config = RequestConfig(max_tokens=512, temperature=0)
|
| 25 |
+
engine = PtEngine('AI-ModelScope/Nemotron-Mini-4B-Instruct')
|
| 26 |
+
response = engine.infer([infer_request], request_config)
|
| 27 |
+
swift_response = response[0].choices[0].message.content
|
| 28 |
+
|
| 29 |
+
engine.default_template.template_backend = 'jinja'
|
| 30 |
+
response = engine.infer([infer_request], request_config)
|
| 31 |
+
jinja_response = response[0].choices[0].message.content
|
| 32 |
+
assert swift_response == jinja_response, (f'swift_response: {swift_response}\njinja_response: {jinja_response}')
|
| 33 |
+
print(f'response: {swift_response}')
|
ms-swift/examples/deploy/agent/client.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
|
| 6 |
+
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_infer_request():
|
| 10 |
+
messages = [{'role': 'user', 'content': "How's the weather in Beijing today?"}]
|
| 11 |
+
tools = [{
|
| 12 |
+
'name': 'get_current_weather',
|
| 13 |
+
'description': 'Get the current weather in a given location',
|
| 14 |
+
'parameters': {
|
| 15 |
+
'type': 'object',
|
| 16 |
+
'properties': {
|
| 17 |
+
'location': {
|
| 18 |
+
'type': 'string',
|
| 19 |
+
'description': 'The city and state, e.g. San Francisco, CA'
|
| 20 |
+
},
|
| 21 |
+
'unit': {
|
| 22 |
+
'type': 'string',
|
| 23 |
+
'enum': ['celsius', 'fahrenheit']
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
'required': ['location']
|
| 27 |
+
}
|
| 28 |
+
}]
|
| 29 |
+
return messages, tools
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def infer(client, model: str, messages, tools):
|
| 33 |
+
messages = messages.copy()
|
| 34 |
+
query = messages[0]['content']
|
| 35 |
+
resp = client.chat.completions.create(model=model, messages=messages, tools=tools, max_tokens=512, temperature=0)
|
| 36 |
+
response = resp.choices[0].message.content
|
| 37 |
+
print(f'query: {query}')
|
| 38 |
+
print(f'response: {response}')
|
| 39 |
+
print(f'tool_calls: {resp.choices[0].message.tool_calls}')
|
| 40 |
+
|
| 41 |
+
tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
|
| 42 |
+
print(f'tool_response: {tool}')
|
| 43 |
+
messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
|
| 44 |
+
resp = client.chat.completions.create(model=model, messages=messages, tools=tools, max_tokens=512, temperature=0)
|
| 45 |
+
response2 = resp.choices[0].message.content
|
| 46 |
+
print(f'response2: {response2}')
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# streaming
|
| 50 |
+
def infer_stream(client, model: str, messages, tools):
|
| 51 |
+
messages = messages.copy()
|
| 52 |
+
query = messages[0]['content']
|
| 53 |
+
gen = client.chat.completions.create(
|
| 54 |
+
model=model, messages=messages, tools=tools, max_tokens=512, temperature=0, stream=True)
|
| 55 |
+
response = ''
|
| 56 |
+
print(f'query: {query}\nresponse: ', end='')
|
| 57 |
+
for chunk in gen:
|
| 58 |
+
delta = chunk.choices[0].delta.content
|
| 59 |
+
response += delta
|
| 60 |
+
print(delta, end='', flush=True)
|
| 61 |
+
print()
|
| 62 |
+
print(f'tool_calls: {chunk.choices[0].delta.tool_calls}')
|
| 63 |
+
|
| 64 |
+
tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
|
| 65 |
+
print(f'tool_response: {tool}')
|
| 66 |
+
messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
|
| 67 |
+
gen = client.chat.completions.create(
|
| 68 |
+
model=model, messages=messages, tools=tools, max_tokens=512, temperature=0, stream=True)
|
| 69 |
+
print(f'query: {query}\nresponse2: ', end='')
|
| 70 |
+
for chunk in gen:
|
| 71 |
+
print(chunk.choices[0].delta.content, end='', flush=True)
|
| 72 |
+
print()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == '__main__':
|
| 76 |
+
host: str = '127.0.0.1'
|
| 77 |
+
port: int = 8000
|
| 78 |
+
client = OpenAI(
|
| 79 |
+
api_key='EMPTY',
|
| 80 |
+
base_url=f'http://{host}:{port}/v1',
|
| 81 |
+
)
|
| 82 |
+
model = client.models.list().data[0].id
|
| 83 |
+
print(f'model: {model}')
|
| 84 |
+
|
| 85 |
+
messages, tools = get_infer_request()
|
| 86 |
+
infer(client, model, messages, tools)
|
| 87 |
+
infer_stream(client, model, messages, tools)
|
ms-swift/examples/deploy/agent/server.sh
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0 swift deploy \
|
| 2 |
+
--model Qwen/Qwen2.5-7B-Instruct \
|
| 3 |
+
--infer_backend vllm \
|
| 4 |
+
--gpu_memory_utilization 0.9 \
|
| 5 |
+
--max_model_len 8192 \
|
| 6 |
+
--max_new_tokens 2048 \
|
| 7 |
+
--agent_template hermes \
|
| 8 |
+
--served_model_name Qwen2.5-7B-Instruct
|
ms-swift/examples/deploy/bert/client.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from swift.llm import InferClient, InferRequest
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def infer_batch(engine: InferClient, infer_requests: List[InferRequest]):
|
| 7 |
+
resp_list = engine.infer(infer_requests)
|
| 8 |
+
query0 = infer_requests[0].messages[0]['content']
|
| 9 |
+
query1 = infer_requests[1].messages[0]['content']
|
| 10 |
+
print(f'query0: {query0}')
|
| 11 |
+
print(f'response0: {resp_list[0].choices[0].message.content}')
|
| 12 |
+
print(f'query1: {query1}')
|
| 13 |
+
print(f'response1: {resp_list[1].choices[0].message.content}')
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
if __name__ == '__main__':
|
| 17 |
+
engine = InferClient(host='127.0.0.1', port=8000)
|
| 18 |
+
models = engine.models
|
| 19 |
+
print(f'models: {models}')
|
| 20 |
+
infer_batch(engine, [
|
| 21 |
+
InferRequest(messages=[{
|
| 22 |
+
'role': 'user',
|
| 23 |
+
'content': '今天天气真好呀'
|
| 24 |
+
}]),
|
| 25 |
+
InferRequest(messages=[{
|
| 26 |
+
'role': 'user',
|
| 27 |
+
'content': '真倒霉'
|
| 28 |
+
}])
|
| 29 |
+
])
|
ms-swift/examples/deploy/client/llm/chat/openai_client.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
|
| 6 |
+
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def infer(client, model: str, messages):
|
| 10 |
+
resp = client.chat.completions.create(model=model, messages=messages, max_tokens=512, temperature=0)
|
| 11 |
+
query = messages[0]['content']
|
| 12 |
+
response = resp.choices[0].message.content
|
| 13 |
+
print(f'query: {query}')
|
| 14 |
+
print(f'response: {response}')
|
| 15 |
+
return response
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# streaming
|
| 19 |
+
def infer_stream(client, model: str, messages):
|
| 20 |
+
gen = client.chat.completions.create(model=model, messages=messages, stream=True, temperature=0)
|
| 21 |
+
print(f'messages: {messages}\nresponse: ', end='')
|
| 22 |
+
for chunk in gen:
|
| 23 |
+
print(chunk.choices[0].delta.content, end='', flush=True)
|
| 24 |
+
print()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def run_client(host: str = '127.0.0.1', port: int = 8000):
|
| 28 |
+
client = OpenAI(
|
| 29 |
+
api_key='EMPTY',
|
| 30 |
+
base_url=f'http://{host}:{port}/v1',
|
| 31 |
+
)
|
| 32 |
+
model = client.models.list().data[0].id
|
| 33 |
+
print(f'model: {model}')
|
| 34 |
+
|
| 35 |
+
query = 'Where is the capital of Zhejiang?'
|
| 36 |
+
messages = [{'role': 'user', 'content': query}]
|
| 37 |
+
response = infer(client, model, messages)
|
| 38 |
+
messages.append({'role': 'assistant', 'content': response})
|
| 39 |
+
messages.append({'role': 'user', 'content': 'What delicious food is there?'})
|
| 40 |
+
infer_stream(client, model, messages)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
if __name__ == '__main__':
|
| 44 |
+
from swift.llm import run_deploy, DeployArguments
|
| 45 |
+
with run_deploy(DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1)) as port:
|
| 46 |
+
run_client(port=port)
|
ms-swift/examples/deploy/lora/server.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Since `swift/test_lora` is trained by swift and contains an `args.json` file,
|
| 2 |
+
# there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
|
| 3 |
+
CUDA_VISIBLE_DEVICES=0 swift deploy \
|
| 4 |
+
--host 0.0.0.0 \
|
| 5 |
+
--port 8000 \
|
| 6 |
+
--adapters lora1=swift/test_lora lora2=swift/test_lora2 \
|
| 7 |
+
--infer_backend vllm
|
ms-swift/examples/deploy/reward_model/client.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
| 2 |
+
from swift.llm import InferClient, InferRequest
|
| 3 |
+
|
| 4 |
+
if __name__ == '__main__':
|
| 5 |
+
engine = InferClient(host='127.0.0.1', port=8000)
|
| 6 |
+
models = engine.models
|
| 7 |
+
print(f'models: {models}')
|
| 8 |
+
messages = [{
|
| 9 |
+
'role': 'user',
|
| 10 |
+
'content': "Hello! What's your name?"
|
| 11 |
+
}, {
|
| 12 |
+
'role': 'assistant',
|
| 13 |
+
'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?'
|
| 14 |
+
}]
|
| 15 |
+
resp_list = engine.infer([InferRequest(messages=messages)])
|
| 16 |
+
print(f'messages: {messages}')
|
| 17 |
+
print(f'response: {resp_list[0].choices[0].message.content}')
|
ms-swift/examples/deploy/reward_model/server.sh
ADDED
@@ -0,0 +1,5 @@
+CUDA_VISIBLE_DEVICES=0 swift deploy \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --model Shanghai_AI_Laboratory/internlm2-1_8b-reward \
+    --infer_backend pt
ms-swift/examples/deploy/server/demo.sh
ADDED
@@ -0,0 +1,14 @@
+CUDA_VISIBLE_DEVICES=0 swift deploy \
+    --model Qwen/Qwen2.5-7B-Instruct \
+    --infer_backend vllm \
+    --served_model_name Qwen2.5-7B-Instruct
+
+# After the server-side deployment above succeeds, use the command below to run a client call test.
+
+# curl http://localhost:8000/v1/chat/completions \
+# -H "Content-Type: application/json" \
+# -d '{
+# "model": "Qwen2.5-7B-Instruct",
+# "messages": [{"role": "user", "content": "What is your name?"}],
+# "temperature": 0
+# }'
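The same client call test can be written with the Python standard library. A sketch, assuming the demo.sh server above is listening on localhost:8000 (nothing is sent until `urlopen` is called):

```python
import json
import urllib.request

# Build the same request the curl command above sends.
def chat_request(payload: dict, url: str = 'http://localhost:8000/v1/chat/completions') -> urllib.request.Request:
    return urllib.request.Request(
        url,
        data=json.dumps(payload).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
    )

req = chat_request({
    'model': 'Qwen2.5-7B-Instruct',  # must match --served_model_name
    'messages': [{'role': 'user', 'content': 'What is your name?'}],
    'temperature': 0,
})
# resp = json.load(urllib.request.urlopen(req))  # uncomment with the server running
print(req.get_full_url())
```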
ms-swift/examples/eval/eval_url/demo.py
ADDED
@@ -0,0 +1,14 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+if __name__ == '__main__':
+    from swift.llm import EvalArguments, eval_main, run_deploy, DeployArguments
+    # Here is a runnable demo that uses the eval_url method for evaluation.
+    # In a real scenario, simply remove the `run_deploy` context and pass your own service URL.
+    print(EvalArguments.list_eval_dataset())
+    with run_deploy(
+            DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'),
+            return_url=True) as url:
+        eval_main(EvalArguments(model='Qwen2.5-1.5B-Instruct', eval_url=url, eval_dataset=['arc']))
ms-swift/examples/eval/eval_url/eval.sh
ADDED
@@ -0,0 +1,7 @@
+# You need a deployed model or API service first.
+swift eval \
+    --model '<model_name>' \
+    --eval_backend OpenCompass \
+    --eval_url http://127.0.0.1:8000/v1 \
+    --eval_limit 100 \
+    --eval_dataset gsm8k
ms-swift/examples/eval/train_eval/train.sh
ADDED
@@ -0,0 +1,24 @@
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model "Qwen/Qwen2.5-0.5B-Instruct" \
+    --train_type "lora" \
+    --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \
+    --torch_dtype "bfloat16" \
+    --num_train_epochs "1" \
+    --per_device_train_batch_size "1" \
+    --learning_rate "1e-4" \
+    --lora_rank "8" \
+    --lora_alpha "32" \
+    --target_modules "all-linear" \
+    --gradient_accumulation_steps "16" \
+    --save_steps "50" \
+    --save_total_limit "5" \
+    --logging_steps "5" \
+    --max_length "2048" \
+    --eval_strategy "steps" \
+    --eval_steps "5" \
+    --per_device_eval_batch_size "5" \
+    --eval_use_evalscope \
+    --eval_datasets "gsm8k" \
+    --eval_datasets_args '{"gsm8k": {"few_shot_num": 0}}' \
+    --eval_limit "10"
ms-swift/examples/eval/vlm/eval.sh
ADDED
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0 \
+MAX_PIXELS=1003520 \
+swift eval \
+    --model Qwen/Qwen2-VL-2B-Instruct \
+    --infer_backend vllm \
+    --eval_limit 100 \
+    --eval_dataset realWorldQA \
+    --eval_backend VLMEvalKit
ms-swift/examples/export/ollama.sh
ADDED
@@ -0,0 +1,4 @@
+swift export \
+    --model Qwen/Qwen2.5-1.5B-Instruct \
+    --to_ollama true \
+    --output_dir Qwen2.5-1.5B-Instruct-ollama
ms-swift/examples/export/push_to_hub.sh
ADDED
@@ -0,0 +1,6 @@
+swift export \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --push_to_hub true \
+    --hub_model_id '<model-id>' \
+    --hub_token '<sdk-token>' \
+    --use_hf false
ms-swift/examples/export/quantize/awq.sh
ADDED
@@ -0,0 +1,12 @@
+CUDA_VISIBLE_DEVICES=0 \
+swift export \
+    --model Qwen/Qwen2.5-72B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+    --device_map cpu \
+    --quant_n_samples 256 \
+    --quant_batch_size 1 \
+    --max_length 2048 \
+    --quant_method awq \
+    --quant_bits 4 \
+    --output_dir Qwen2.5-72B-Instruct-AWQ
ms-swift/examples/export/quantize/bert/bnb.sh
ADDED
@@ -0,0 +1,16 @@
+# merge-lora
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --adapters swift/test_bert \
+    --output_dir output/swift_test_bert_merged \
+    --merge_lora true
+
+# bnb quantize
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model output/swift_test_bert_merged \
+    --output_dir output/swift_test_bert_bnb_int4 \
+    --quant_bits 4 \
+    --quant_method bnb
+
+# infer
+CUDA_VISIBLE_DEVICES=0 swift infer \
+    --model output/swift_test_bert_bnb_int4
ms-swift/examples/export/quantize/bert/gptq.sh
ADDED
@@ -0,0 +1,18 @@
+# merge-lora
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --adapters swift/test_bert \
+    --output_dir output/swift_test_bert_merged \
+    --merge_lora true
+
+# gptq quantize
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model output/swift_test_bert_merged \
+    --load_data_args true \
+    --output_dir output/swift_test_bert_gptq_int4 \
+    --quant_bits 4 \
+    --quant_method gptq \
+    --max_length 512
+
+# infer
+CUDA_VISIBLE_DEVICES=0 swift infer \
+    --model output/swift_test_bert_gptq_int4
ms-swift/examples/export/quantize/bnb.sh
ADDED
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0 \
+swift export \
+    --model Qwen/Qwen2.5-1.5B-Instruct \
+    --quant_method bnb \
+    --quant_bits 4 \
+    --bnb_4bit_quant_type nf4 \
+    --bnb_4bit_use_double_quant true \
+    --output_dir Qwen2.5-1.5B-Instruct-BNB-NF4
ms-swift/examples/export/quantize/gptq.sh
ADDED
@@ -0,0 +1,13 @@
+# For the OMP_NUM_THREADS=14 setting, see issue: https://github.com/AutoGPTQ/AutoGPTQ/issues/439
+OMP_NUM_THREADS=14 \
+CUDA_VISIBLE_DEVICES=0 \
+swift export \
+    --model Qwen/Qwen2.5-1.5B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+    --quant_n_samples 256 \
+    --quant_batch_size 1 \
+    --max_length 2048 \
+    --quant_method gptq \
+    --quant_bits 4 \
+    --output_dir Qwen2.5-1.5B-Instruct-GPTQ-Int4
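As a back-of-envelope check on why `--quant_bits 4` matters, the weight footprint scales linearly with bit width. A rough sketch (the parameter count is approximate, and this ignores unquantized embeddings, group-wise scales, and activation memory):

```python
def weight_size_gb(n_params: float, bits: int) -> float:
    """Approximate size of the weights alone: params * bits, converted to GiB."""
    return n_params * bits / 8 / 1024**3

n_params = 1.5e9  # roughly Qwen2.5-1.5B-Instruct
bf16 = weight_size_gb(n_params, 16)
int4 = weight_size_gb(n_params, 4)
print(f'bf16: {bf16:.2f} GiB, int4: {int4:.2f} GiB, ratio: {bf16 / int4:.0f}x')
```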
ms-swift/examples/export/quantize/mllm/awq.sh
ADDED
@@ -0,0 +1,19 @@
+pip uninstall autoawq
+pip install git+https://github.com/casper-hansen/AutoAWQ.git --no-deps  # or "autoawq>=0.2.9"
+
+CUDA_VISIBLE_DEVICES=0 \
+MAX_PIXELS=1003520 \
+VIDEO_MAX_PIXELS=50176 \
+FPS_MAX_FRAMES=12 \
+swift export \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+              'modelscope/coco_2014_caption:validation#500' \
+              'swift/VideoChatGPT:Generic#500' \
+    --quant_n_samples 256 \
+    --quant_batch_size -1 \
+    --max_length 2048 \
+    --quant_method awq \
+    --quant_bits 4 \
+    --output_dir Qwen2.5-VL-3B-Instruct-AWQ
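The `MAX_PIXELS` and `VIDEO_MAX_PIXELS` values above cap image and per-frame video resolution during calibration. Assuming the 28x28 visual-patch convention of Qwen2.5-VL-style processors (an assumption of this sketch), both budgets work out to round patch counts:

```python
PATCH = 28 * 28  # assumed visual patch area for Qwen2.5-VL-style processors

max_pixels = 1003520        # MAX_PIXELS from the command above
video_max_pixels = 50176    # VIDEO_MAX_PIXELS from the command above
print(max_pixels // PATCH)        # image budget, in patches
print(video_max_pixels // PATCH)  # per-frame video budget, in patches
```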
ms-swift/examples/export/quantize/moe/awq.sh
ADDED
@@ -0,0 +1,14 @@
+pip uninstall autoawq
+pip install git+https://github.com/casper-hansen/AutoAWQ.git --no-deps  # or "autoawq>=0.2.9"
+
+CUDA_VISIBLE_DEVICES=0,1 \
+swift export \
+    --model Qwen/Qwen3-30B-A3B \
+    --dataset 'swift/Qwen3-SFT-Mixin' \
+    --device_map auto \
+    --quant_n_samples 64 \
+    --quant_batch_size -1 \
+    --max_length 8192 \
+    --quant_method awq \
+    --quant_bits 4 \
+    --output_dir Qwen3-30B-A3B-AWQ
ms-swift/examples/infer/demo_reward_model.py
ADDED
@@ -0,0 +1,31 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import List
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']):
+    resp_list = engine.infer(infer_requests)
+    print(f'messages0: {infer_requests[0].messages}')
+    print(f'response0: {resp_list[0].choices[0].message.content}')
+
+
+if __name__ == '__main__':
+    from swift.llm import InferEngine, InferRequest, PtEngine, load_dataset
+    model = 'Shanghai_AI_Laboratory/internlm2-1_8b-reward'
+    engine = PtEngine(model, max_batch_size=64)
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
+    print(f'dataset: {dataset}')
+    infer_requests = [InferRequest(**data) for data in dataset]
+    infer_batch(engine, infer_requests)
+
+    messages = [{
+        'role': 'user',
+        'content': "Hello! What's your name?"
+    }, {
+        'role': 'assistant',
+        'content': 'My name is InternLM2! A helpful AI assistant. What can I do for you?'
+    }]
+    infer_batch(engine, [InferRequest(messages=messages)])
ms-swift/examples/infer/lmdeploy/ddp.sh
ADDED
@@ -0,0 +1,7 @@
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift infer \
+    --model Qwen/Qwen2.5-7B-Instruct \
+    --infer_backend lmdeploy \
+    --val_dataset AI-ModelScope/alpaca-gpt4-data-zh#1000 \
+    --max_new_tokens 2048
ms-swift/examples/infer/lmdeploy/mllm_tp.sh
ADDED
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0,1 \
+swift infer \
+    --model OpenGVLab/InternVL2_5-1B \
+    --infer_backend lmdeploy \
+    --val_dataset AI-ModelScope/captcha-images#1000 \
+    --tp 2 \
+    --vision_batch_size 8 \
+    --max_new_tokens 2048
ms-swift/examples/infer/pt/lora.sh
ADDED
@@ -0,0 +1,10 @@
+# Since `swift/test_lora` is trained by swift and contains an `args.json` file,
+# there is no need to explicitly set `--model`, `--system`, etc., as they will be automatically read.
+# To disable this behavior, please set `--load_args false`.
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters swift/test_lora \
+    --infer_backend pt \
+    --stream true \
+    --temperature 0 \
+    --max_new_tokens 2048
ms-swift/examples/infer/pt/mllm_device_map.sh
ADDED
@@ -0,0 +1,9 @@
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+MAX_PIXELS=1003520 \
+swift infer \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
+    --infer_backend pt \
+    --val_dataset AI-ModelScope/LaTeX_OCR#1000 \
+    --max_batch_size 16 \
+    --max_new_tokens 512
ms-swift/examples/infer/vllm/mllm_tp.sh
ADDED
@@ -0,0 +1,11 @@
+CUDA_VISIBLE_DEVICES=0,1 \
+MAX_PIXELS=1003520 \
+swift infer \
+    --model Qwen/Qwen2.5-VL-3B-Instruct \
+    --infer_backend vllm \
+    --val_dataset AI-ModelScope/LaTeX_OCR#1000 \
+    --gpu_memory_utilization 0.9 \
+    --tensor_parallel_size 2 \
+    --max_model_len 32768 \
+    --max_new_tokens 2048 \
+    --limit_mm_per_prompt '{"image": 5, "video": 2}'
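Note the shell quoting on the last flag: `--limit_mm_per_prompt` takes a JSON string (single-quoted so the inner double quotes survive) that caps how many multimodal items a single prompt may carry. A quick sanity check of the argument format:

```python
import json

# The value passed above: at most 5 images and 2 videos per request.
limits = json.loads('{"image": 5, "video": 2}')
print(limits)
```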